# Load libraries
library(plyr)
library(tidyverse)
library(readxl)
library(tidylog)
library(RCurl)
library(janitor)
library(patchwork)
library(devtools)
library(viridis)
library(nngeo)
library(conflicted)
# Source code for map plots
source_url("https://raw.githubusercontent.com/maxlindmark/cod-interactions/main/R/functions/map-plot.R")

# Project root; every file path below is built from this
home <- here::here()

# Because I loaded plyr (tidylog uses dplyr): register all masking
# preferences in a single conflicts_prefer() call instead of one call
# per function
conflicts_prefer(
  tidylog::mutate,
  tidylog::summarise,
  tidylog::filter,
  tidylog::distinct,
  tidylog::group_by,
  tidylog::drop_na,
  tidylog::ungroup,
  tidylog::left_join,
  tidylog::select,
  dplyr::arrange
)
# Load coordinate-changing functions
# Function to change coordinates to something more useful
# Convert a coordinate stored as (D)DMM (degrees followed by minutes,
# e.g. 5545 = 55 deg 45 min) into decimal degrees (55.75).
#
# x: numeric vector of coordinates in degree-minute form; may be negative
#    (sign is preserved, so W/S coordinates work too).
# Returns a numeric vector of decimal degrees, same length as x.
#
# NOTE(review): substring(x, 3, 4) keeps only whole minutes, so fractional
# minutes (e.g. 5545.5) are silently dropped -- confirm this precision loss
# (~0.008 degrees at most) is acceptable for the nearest-point matching.
format.position <- function(x){
  sign.x <- sign(x)
  x <- abs(x)
  # Left-pad to four characters so degrees always occupy positions 1-2
  x <- ifelse(nchar(x) == 3, paste("0", x, sep = ""), x)
  x <- ifelse(nchar(x) == 2, paste("00", x, sep = ""), x)
  x <- ifelse(nchar(x) == 1, paste("000", x, sep = ""), x)
  # Degrees + minutes/60
  dec.x <- as.numeric(paste(substring(x, 1, 2))) + as.numeric(paste(substring(x, 3, 4))) / 60
  dec.x <- sign.x * dec.x
  # Return explicitly: the original ended on an assignment, whose value is
  # only returned invisibly
  dec.x
}
Join CTD and stomach data
‘Tidy datasets are all alike but every messy dataset is messy in its own way’
Here I will try to join CTD data to trawl data using the new key smhi_serial_no that was recently added to the trawl database. Essentially all surveys (quarter 1 and 4 every year) have a different file format.
Load packages
Load data
First read the stomach + trawl data. Take the unique haul id because that’s all we need
# Read the aggregated stomach + trawl data and reduce it to one row per haul;
# smhi_serial_no is the key we will later use to join CTD casts onto hauls
d <- read_csv(paste0(home, "/data/clean/aggregated_stomach_data.csv")) %>%
dplyr::select(lon, lat, X, Y, year, quarter, smhi_serial_no, haul_id, bottom_depth) %>%
# One row per haul (the stomach data has many rows per haul)
distinct(haul_id, .keep_all = TRUE) %>%
# Survey-level keys: quarter_year and quarter_year_serial
mutate(q_year = paste(quarter, year, sep = "_"),
smhi_serial_no_all = paste(q_year, smhi_serial_no, sep = "_")) %>%
as.data.frame()Check if there are any NA’s in the one key we have to join the data
# Count NA serial numbers per survey (year x quarter); hauls with an NA key
# cannot be joined to a CTD cast by key and need coordinate matching instead
d %>%
group_by(year, quarter) %>%
summarise(sum_na = sum(is.na(smhi_serial_no))) %>%
as.data.frame() year quarter sum_na
1 2015 4 0
2 2016 1 0
3 2016 4 0
4 2017 1 0
5 2017 4 0
6 2018 1 0
7 2018 4 0
8 2019 4 0
9 2020 1 0
10 2020 4 7
11 2021 1 22
12 2021 4 13
13 2022 1 12
14 2022 4 14
Ok, so in addition to all the peculiarities with the data structure (sometimes haul is given in as a name of a tab, sometimes filename, sometimes cell in a non-R friendly ctd output), and the raw CTD data itself, it doesn’t look very promising that there are many missing keys in recent data.
I will anyway try with an early survey without any NA’s in the key.
Start with Q4 2015
# Nov 2015
# NOTE(review): setwd() mutates global state; the read_excel() calls in the
# next chunk depend on this working directory
setwd(paste0(home, "/data/bits-oxygen/15_11/CTD_SBE911/CNV/for_r/"))
# One xlsx file per CTD cast; the serial number is the last dot-separated
# token of the file name.
# NOTE(review): pattern is a regex, not a glob -- "*.xlsx" happens to work
# here but "\\.xlsx$" is the intended expression
filenames <- list.files(pattern = "*.xlsx")
filenames [1] "26da.2015.12.1.xlsx" "26da.2015.12.10.xlsx" "26da.2015.12.11.xlsx"
[4] "26da.2015.12.13.xlsx" "26da.2015.12.14.xlsx" "26da.2015.12.17.xlsx"
[7] "26da.2015.12.18.xlsx" "26da.2015.12.20.xlsx" "26da.2015.12.22.xlsx"
[10] "26da.2015.12.24.xlsx" "26da.2015.12.26.xlsx" "26da.2015.12.27.xlsx"
[13] "26da.2015.12.3.xlsx" "26da.2015.12.30.xlsx" "26da.2015.12.33.xlsx"
[16] "26da.2015.12.34.xlsx" "26da.2015.12.36.xlsx" "26da.2015.12.38.xlsx"
[19] "26da.2015.12.40.xlsx" "26da.2015.12.42.xlsx" "26da.2015.12.44.xlsx"
[22] "26da.2015.12.46.xlsx" "26da.2015.12.48.xlsx" "26da.2015.12.5.xlsx"
[25] "26da.2015.12.50.xlsx" "26da.2015.12.52.xlsx" "26da.2015.12.54.xlsx"
[28] "26da.2015.12.56.xlsx" "26da.2015.12.58.xlsx" "26da.2015.12.7.xlsx"
# Read every cast file, keep the temperature, oxygen, depth and position
# columns (picked by sheet position), and record which file each row came from
q4_15 <- purrr::map_df(filenames,
                       function(f) {
                         read_excel(f, skip = 385, col_types = "guess") %>%
                           dplyr::select(4, 12, 19, 23, 24) %>%
                           dplyr::rename(temp = 1,
                                         oxy = 2,
                                         depth = 3,
                                         lat = 4,
                                         lon = 5) %>%
                           mutate(filename = f)
                       })
# Derive the CTD serial number from the file name ("26da.2015.12.1" -> 1)
# and rename the CTD coordinates so they cannot clash with trawl lon/lat
q4_15 <- q4_15 %>%
  # fixed() treats ".xlsx" literally; as a regex the dot means "any
  # character" (harmless for these names, but wrong in general)
  mutate(filename = str_remove(filename, fixed(".xlsx"))) %>%
  # Everything after the last dot is the serial number
  mutate(smhi_serial_no = as.numeric(sub(".*\\.(.*)", "\\1", filename))) %>%
  dplyr::select(temp, oxy, smhi_serial_no, lat, lon, depth) %>%
  dplyr::rename(lat_ctd = lat,
                lon_ctd = lon)
# Now see how the unique values for the key ("smhi_serial_no") for trawl data and CTD data
# Serial numbers present in the trawl (stomach) data for Q4 2015...
d %>%
filter(year == 2015, quarter == 4) %>%
distinct(smhi_serial_no) %>%
arrange(smhi_serial_no) %>%
pull() [1] 1 3 5 7 10 11 14 17 18 20 30 33 36 48 50 52
# ...and in the CTD data (there are more CTD casts than trawl hauls, which
# is expected -- casts without a matching haul are simply unused)
q4_15 %>%
distinct(smhi_serial_no) %>%
arrange(smhi_serial_no) %>%
pull() [1] 1 3 5 7 10 11 13 14 17 18 20 22 24 26 27 30 33 34 36 38 40 42 44 46 48
[26] 50 52 54 56 58
# Explore the data a bit, because we have CTD data for many depths per trawl station
# Oxygen profiles for casts that match a Q4-2015 haul, coloured by cast
q4_15 %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2015, quarter == 4)$smhi_serial_no))) %>%
ggplot(aes(oxy, depth, color = as.factor(smhi_serial_no))) +
# Reverse the y axis so depth increases downwards
scale_y_continuous(trans = "reverse") +
scale_color_viridis(discrete = TRUE) +
geom_point()q4_15 %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2015, quarter == 4)$smhi_serial_no))) %>%
# Same profiles, one panel per cast
ggplot(aes(oxy, depth)) +
scale_y_continuous(trans = "reverse") +
geom_point(size = 0.5) +
facet_wrap(~smhi_serial_no)# Before joining, we need to summarise the oxygen a bit. I *assume* that the deepest point is the bottom. I will average oxygen for the deepest 3 metres.
# Near-bottom summary per cast: keep readings within 3 m of the deepest
# reading of that cast (assumed to be the bottom), then average oxygen,
# depth and temperature over those readings
q4_15_trim <- q4_15 %>%
  group_by(smhi_serial_no) %>%
  filter(depth > max(depth) - 3) %>%
  summarise(oxy = mean(oxy),
            depth = mean(depth),
            temp = mean(temp)) %>%
  ungroup()
# Join in oxygen data!
# Attach near-bottom CTD values to the Q4-2015 trawl hauls via the serial key
d_q4_15 <- d %>%
  filter(year == 2015 & quarter == 4) %>%
  left_join(q4_15_trim, by = "smhi_serial_no")
# Check oxygen values
# Seems low? This is from the BITS manual:
# Selected hauls should be omitted in the case when the results of at least two stations in the same depth layers have revealed that fish not appeared in the zone which was covered by the net opening and when hydrographical observations have revealed Manual of the Baltic International Trawl Surveys (BITS) 5 that oxygen content is less that 1.5 ml/l in the layer of vertical net opening
ggplot(d_q4_15, aes(smhi_serial_no, oxy)) +
geom_point() +
geom_hline(yintercept = 1.5, alpha = 0.5, linetype = 2)ggplot(d_q4_15, aes(smhi_serial_no, temp)) +
geom_point()# Check also the coordinates match
# Build sf point layers (WGS84) for trawl hauls and CTD casts so we can
# check visually that matched coordinates agree.
# (The original `ctd_q415 <- q4_15_trim` pre-assignment was dead code --
# ctd_q415 is created from scratch just below -- so it has been removed.)
trawl_q415 <- d %>% filter(year == 2015, quarter == 4)
# https://stackoverflow.com/questions/71959927/spatial-join-two-data-frames-by-nearest-feature-and-date-in-r
trawl_sf_q415 <- trawl_q415 %>% st_as_sf(coords = c("lon", "lat"), remove = FALSE) %>%
  st_set_crs(4326)
ctd_q415 <- q4_15_trim %>%
  # Add back in coord since they were removed after summarising
  left_join(q4_15 %>%
              distinct(smhi_serial_no, .keep_all = TRUE) %>%
              dplyr::select(lat_ctd, lon_ctd, smhi_serial_no), by = "smhi_serial_no") %>%
  st_as_sf(coords = c("lon_ctd", "lat_ctd"), remove = FALSE) %>%
  st_set_crs(4326)
# Plot on map
st_connect(trawl_sf_q415, ctd_q415) %>%
mapview::mapview() +
mapview::mapview(trawl_sf_q415, color = 'tomato', col.regions = 'tomato') +
mapview::mapview(ctd_q415, color = 'steelblue', col.regions = 'steelblue')Calculating nearest IDs
|
| | 0%
|
|==== | 6%
|
|========= | 12%
|
|============= | 19%
|
|================== | 25%
|
|====================== | 31%
|
|========================== | 38%
|
|=============================== | 44%
|
|=================================== | 50%
|
|======================================= | 56%
|
|============================================ | 62%
|
|================================================ | 69%
|
|==================================================== | 75%
|
|========================================================= | 81%
|
|============================================================= | 88%
|
|================================================================== | 94%
|
|======================================================================| 100%
Calculating lines
Seems to work! Now try a more recent year with NAs where we need to match by coordinates.. Let’s do Q4 2020. This specific survey comes in a single file where station is not indicated by tab name, but a string in somewhat consistent position in the excel file. Each tab does have a 3 digit number that could be the serial no, but after looking at the serial numbers in the trawl data I see that it cannot be. They are in the 700-range.
d %>%
filter(year == 2020 & quarter == 4) %>%
distinct(smhi_serial_no) %>% pull() [1] 704 710 730 NA 734 719 735 737 703 715 716
Another critical issue here is that in this specific cell the key is placed within a string. Like this ** Station: 6.5 Faro 0716, where 0716 is the key. This is possible to retrieve though. However, for some tabs, only the station name is given, not the number. Like this: ** Station: 12se Nar. This will result in an NA. This means we now have two sources for NA in the matching columns, and I will therefore try to join by coordinates instead. HOWEVER, the coordinates are not stored in a column, but as part of a random string I need to find in the raw data… Hence the multiple lapply(tab_names_trim, function(x) read_excel(path = xl_data calls with different “skips”. Finally, note that the coordinates are not in decimal degrees but in degrees and decimal minutes …
In this code, I read all tabs containing CTD data and put them in a list. Then I loop through all data frames in the list and clean them up.
setwd(home)
# Q4 2020: a single workbook with one tab per CTD cast
xl_data <- paste0(home, "/data/bits-oxygen/20_11/CTD_BITS-06.xlsx")
tab_names <- excel_sheets(path = xl_data)
# Keep only the tabs that hold CTD profiles
tab_names_trim <- tab_names[grepl("\\_SBE", tab_names)]
# Put all tabs in a list. Skip different # rows to get either raw data, id or lat/lon
# NOTE(review): each sheet is read four times with different skips so that the
# station-id / latitude / longitude header rows end up as column names --
# slow, but it works around the non-tabular sheet layout
list_vars <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 12))
list_all_id <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x))
list_all_lat <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 8))
list_all_lon <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 9))
# list_all contains all relevant tabs. Loop through all these
# For each tab: extract the profile columns, scrape the station id and the
# coordinates out of the header rows, and convert everything to numeric
vars_list <- list()
for(i in 1:length(tab_names_trim)){
# Get the actual CTD data
# (columns are picked by header keywords: temperature in deg C, true depth
# in m, oxygen in ml/l)
vars <- list_vars[[i]] %>%
dplyr::select(contains(c("deg C", "[true depth, m]", "[ml/l]"))) %>%
dplyr::rename(temperature = 1,
depth = 2,
oxygen = 3)
# Now get the ID
# The station header became the first column name when the tab was read with
# no skip; the serial number is its last space-separated token. When the tab
# only carries a station name (e.g. "** Station: 12se Nar") as.numeric()
# yields NA -- this is one of the two NA sources discussed above
id <- names(list_all_id[[i]])[1]
id <- strsplit(id, " ")[[1]]
id <- as.numeric(id[length(id)])
vars$smhi_serial_no <- id
# Lat and lon for cross-checking
vars$lat <- names(list_all_lat[[i]])[1]
vars$lon <- names(list_all_lon[[i]])[1]
# Clean up data!
#lat.d.d = format.position(lat.dm.m)
vars <- vars %>%
mutate(temperature = as.numeric(temperature),
depth = as.numeric(depth),
oxygen = as.numeric(oxygen),
smhi_serial_no = as.numeric(smhi_serial_no)) %>%
# Strip the header text around the coordinate, then convert from degrees +
# decimal minutes (e.g. "57 54.00") to decimal degrees via format.position()
mutate(lat = str_remove(lat, "\\** Latitude: ")) %>%
mutate(lat = str_remove(lat, " N")) %>%
# NOTE(review): str_remove() drops only the FIRST space -- confirm the
# remaining string is always a single DDMM.M number after this
mutate(lat = str_remove(lat, " ")) %>%
mutate(lat = format.position(as.numeric(lat))) %>%
mutate(lat = as.numeric(lat)) %>%
mutate(lon = str_remove(lon, "\\** Longitude: ")) %>%
mutate(lon = str_remove(lon, " E")) %>%
mutate(lon = str_remove(lon, " ")) %>%
mutate(lon = format.position(as.numeric(lon))) %>%
mutate(lon = as.numeric(lon)) %>%
dplyr::rename(lat_ctd = lat, lon_ctd = lon, depth_ctd = depth)
vars_list[[i]] <- vars
}
q4_20 <- bind_rows(vars_list)First explore the data
q4_20 %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2020, quarter == 4)$smhi_serial_no))) %>%
ggplot(aes(oxygen, depth_ctd, color = as.factor(smhi_serial_no))) +
scale_y_continuous(trans = "reverse") +
scale_color_viridis(discrete = TRUE) +
geom_point()q4_20 %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2020, quarter == 4)$smhi_serial_no))) %>%
ggplot(aes(oxygen, depth_ctd)) +
scale_y_continuous(trans = "reverse") +
geom_point(size = 0.5) +
facet_wrap(~smhi_serial_no)Again, lots of NA in the smhi_serial_no. Make a new station ID by using coordinates instead.
q4_20 %>%
arrange(desc(depth_ctd))# A tibble: 2,993 × 6
temperature depth_ctd oxygen smhi_serial_no lat_ctd lon_ctd
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -2.04 2122. 15.1 713 57.9 19.4
2 -8.91 1886. -140. 713 57.9 19.4
3 -13.2 1466. -137. 713 57.9 19.4
4 -8.90 1140. -126. 713 57.9 19.4
5 -2.03 886. 12.8 713 57.9 19.4
6 2.09 689. 10.9 713 57.9 19.4
7 4.56 536. 9.96 713 57.9 19.4
8 6.04 417. 9.42 713 57.9 19.4
9 6.94 324. 9.07 713 57.9 19.4
10 7.47 252. 8.87 713 57.9 19.4
# ℹ 2,983 more rows
# Fall-back station id built from the rounded CTD coordinates, since the
# serial-number key has NAs for this survey
q4_20_id <- q4_20 %>%
  mutate(id = paste0(round(lat_ctd, digits = 3), "_", round(lon_ctd, digits = 3)))
q4_20_id %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2020, quarter == 4)$smhi_serial_no))) %>%
ggplot(aes(oxygen, depth_ctd, color = as.factor(id))) +
scale_y_continuous(trans = "reverse") +
scale_color_viridis(discrete = TRUE) +
geom_point()q4_20_id %>%
filter(smhi_serial_no %in% c(unique(filter(d, year == 2020, quarter == 4)$smhi_serial_no))) %>%
ggplot(aes(oxygen, depth_ctd)) +
scale_y_continuous(trans = "reverse") +
geom_point(size = 0.5) +
facet_wrap(~id)Still an NA but that’s all we can do. Repeat the processes of summarizing oxygen near bottom.
# Near-bottom summary per coordinate-based id: keep readings within 3 m of
# the deepest reading of each cast, then average oxygen, depth and
# temperature over those readings
q4_20_trim <- q4_20_id %>%
  group_by(id) %>%
  filter(depth_ctd > max(depth_ctd) - 3) %>%
  summarise(oxy = mean(oxygen),
            depth = mean(depth_ctd),
            temp = mean(temperature)) %>%
  ungroup()
# Inspect
q4_20_trim %>% as.data.frame() id oxy depth temp
1 55.183_13.583 4.37845000 39.65850 12.471833
2 55.25_13.9 6.68965000 36.65450 10.718050
3 55.45_14.567 2.63443333 57.92550 12.271250
4 55.667_14.5 3.19096667 45.80550 10.971283
5 55.683_14.367 5.60985000 39.73350 10.869200
6 55.7_14.417 6.79985000 35.68600 10.869567
7 55.817_15.5 2.99953333 45.28550 10.696700
8 55.9_15.6 3.83463333 47.83317 10.678850
9 56.517_16.833 0.10241667 58.97433 5.675767
10 56.717_17 0.35266667 61.00600 5.593233
11 56.7_17.017 0.12108333 64.54700 5.604883
12 56.967_17.2 0.11041667 68.58183 5.691567
13 56.983_17.917 0.08928333 66.55133 5.874167
14 57.083_18.917 0.08175000 85.23950 6.469133
15 57.167_18.883 4.63080000 50.87600 5.836750
16 57.233_19.05 0.10833333 78.18417 6.066900
17 57.333_19.233 0.07191667 97.88750 6.804233
18 57.3_19.1 0.38640000 73.64500 5.954150
19 57.667_19.383 0.09115000 74.14717 6.149417
20 57.75_19.467 0.08200000 78.69850 6.290733
21 57.867_19.517 0.60720000 68.62767 5.967217
22 57.9_19.45 15.08860000 2121.50400 -2.043100
23 58.033_19.5 0.07616667 100.96650 6.330050
24 58.083_19.45 0.09268333 84.78933 6.235283
25 NA_NA 0.09283333 75.67717 5.843117
Ok, so here we also see that in one of the stations, the depth measure is way off, and this is verified in the excel file as well.
Remove this.
q4_20_trim <- q4_20_trim %>% filter(depth < 300)Here we would have joined this trimmed CTD data with the trawl data, but because of the lack of key, I will instead have to do it by matching nearest point.
# https://stackoverflow.com/questions/59621797/evaluating-the-closest-distance-from-one-point-between-multiple-options
# Trawl hauls for Q4 2020 as sf points (WGS84)
trawl_sf <- d %>%
filter(year == 2020 & quarter == 4) %>%
st_as_sf(coords = c("lon", "lat"), remove = FALSE) %>%
st_set_crs(4326)
# CTD casts as sf points: recover lat/lon from the "lat_lon" id string; the
# cast whose coordinates could not be parsed ("NA_NA") is dropped here
ctd_sf <- q4_20_trim %>%
separate(id, sep = "_", into = c("lat", "lon"), convert = TRUE) %>%
drop_na(lat) %>%
drop_na(lon) %>%
st_as_sf(coords = c("lon", "lat"), remove = FALSE) %>%
st_set_crs(4326)
# Join df with df1, based on the nearest feature:
nrow(trawl_sf)[1] 17
nrow(ctd_sf)[1] 23
# Each trawl haul receives the values of the geographically nearest CTD cast
df_near <- st_join(trawl_sf, ctd_sf, join = st_nearest_feature)
str(df_near)Classes 'sf' and 'data.frame': 17 obs. of 17 variables:
$ lon.x : num 17 19.2 13.6 13.9 13.9 ...
$ lat.x : num 57.4 57.6 55.2 55.3 55.2 ...
$ X : num 621 749 411 431 433 ...
$ Y : num 6364 6392 6118 6127 6123 ...
$ year : num 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
$ quarter : num 4 4 4 4 4 4 4 4 4 4 ...
$ smhi_serial_no : num 704 710 730 NA NA NA 734 719 735 NA ...
$ haul_id : chr "2020_4_213" "2020_4_218" "2020_4_237" "2020_4_238" ...
$ bottom_depth : num 552 465 433 371 381 565 464 484 343 381 ...
$ q_year : chr "4_2020" "4_2020" "4_2020" "4_2020" ...
$ smhi_serial_no_all: chr "4_2020_704" "4_2020_710" "4_2020_730" "4_2020_NA" ...
$ lat.y : num 57 57.7 55.2 55.2 55.2 ...
$ lon.y : num 17.2 19.4 13.6 13.9 13.9 ...
$ oxy : num 0.1104 0.0911 4.3784 6.6896 6.6896 ...
$ depth : num 68.6 74.1 39.7 36.7 36.7 ...
$ temp : num 5.69 6.15 12.47 10.72 10.72 ...
$ geometry :sfc_POINT of length 17; first list element: 'XY' num 17 57.4
- attr(*, "sf_column")= chr "geometry"
- attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "names")= chr [1:16] "lon.x" "lat.x" "X" "Y" ...
# Check distance in metres
# For each CTD cast, find the nearest trawl haul and compute the distance
# between them, largest first, to judge which matches are acceptable.
# NOTE(review): `by_element = T` -- prefer TRUE over the reassignable T
ctd_sf %>%
cbind(
trawl_sf[st_nearest_feature(ctd_sf, trawl_sf),]) %>%
mutate(dist = st_distance(geometry, geometry.1, by_element = T)) %>%
arrange(desc(dist)) %>%
as.data.frame() lat lon oxy depth temp lon.1 lat.1 X
14 56.517 16.833 0.10241667 58.97433 5.675767 16.90000 57.35000 614.3265
14.2 56.700 17.017 0.12108333 64.54700 5.604883 16.90000 57.35000 614.3265
14.1 56.717 17.000 0.35266667 61.00600 5.593233 16.90000 57.35000 614.3265
8 56.983 17.917 0.08928333 66.55133 5.874167 18.85000 57.16667 732.7606
1 56.967 17.200 0.11041667 68.58183 5.691567 17.01667 57.40000 621.1803
2 57.667 19.383 0.09115000 74.14717 6.149417 19.16667 57.60000 748.9352
17 57.750 19.467 0.08200000 78.69850 6.290733 19.50000 57.86667 766.8602
13.1 57.333 19.233 0.07191667 97.88750 6.804233 19.05000 57.30000 743.9612
8.1 57.083 18.917 0.08175000 85.23950 6.469133 18.85000 57.16667 732.7606
13 57.233 19.050 0.10833333 78.18417 6.066900 19.05000 57.30000 743.9612
6 55.450 14.567 2.63443333 57.92550 12.271250 14.46667 55.45000 466.2665
16 58.033 19.500 0.07616667 100.96650 6.330050 19.43333 58.06667 761.4486
12 55.817 15.500 2.99953333 45.28550 10.696700 15.56667 55.83333 535.4935
11 55.900 15.600 3.83463333 47.83317 10.678850 15.56667 55.86667 535.4631
4 55.250 13.900 6.68965000 36.65450 10.718050 13.91667 55.28333 431.1914
10 55.700 14.417 6.79985000 35.68600 10.869567 14.36667 55.70000 460.1953
13.2 57.300 19.100 0.38640000 73.64500 5.954150 19.05000 57.30000 743.9612
3 55.183 13.583 4.37845000 39.65850 12.471833 13.60000 55.20000 410.8932
16.1 58.083 19.450 0.09268333 84.78933 6.235283 19.43333 58.06667 761.4486
8.2 57.167 18.883 4.63080000 50.87600 5.836750 18.85000 57.16667 732.7606
7 55.667 14.500 3.19096667 45.80550 10.971283 14.50000 55.68333 468.5617
9 55.683 14.367 5.60985000 39.73350 10.869200 14.35000 55.68333 459.1304
17.1 57.867 19.517 0.60720000 68.62767 5.967217 19.50000 57.86667 766.8602
Y year quarter smhi_serial_no haul_id bottom_depth q_year
14 6357.944 2020 4 703 2020_4_212 435 4_2020
14.2 6357.944 2020 4 703 2020_4_212 435 4_2020
14.1 6357.944 2020 4 703 2020_4_212 435 4_2020
8 6342.514 2020 4 719 2020_4_227 484 4_2020
1 6363.710 2020 4 704 2020_4_213 552 4_2020
2 6391.826 2020 4 710 2020_4_218 465 4_2020
17 6422.747 2020 4 716 2020_4_224 69 4_2020
13.1 6358.042 2020 4 NA 2020_4_229 677 4_2020
8.1 6342.514 2020 4 719 2020_4_227 484 4_2020
13 6358.042 2020 4 NA 2020_4_229 677 4_2020
6 6144.998 2020 4 NA 2020_4_240 565 4_2020
16 6444.723 2020 4 715 2020_4_223 754 4_2020
12 6187.676 2020 4 NA 2020_4_245 375 4_2020
11 6191.385 2020 4 737 2020_4_244 502 4_2020
4 6126.856 2020 4 NA 2020_4_238 371 4_2020
10 6172.873 2020 4 NA 2020_4_243 381 4_2020
13.2 6358.042 2020 4 NA 2020_4_229 677 4_2020
3 6117.942 2020 4 730 2020_4_237 433 4_2020
16.1 6444.723 2020 4 715 2020_4_223 754 4_2020
8.2 6342.514 2020 4 719 2020_4_227 484 4_2020
7 6170.950 2020 4 734 2020_4_241 464 4_2020
9 6171.028 2020 4 735 2020_4_242 343 4_2020
17.1 6422.747 2020 4 716 2020_4_224 69 4_2020
smhi_serial_no_all dist geometry
14 4_2020_703 92851.403 [m] POINT (16.833 56.517)
14.2 4_2020_703 72732.099 [m] POINT (17.017 56.7)
14.1 4_2020_703 70752.168 [m] POINT (17 56.717)
8 4_2020_719 60169.007 [m] POINT (17.917 56.983)
1 4_2020_704 49478.294 [m] POINT (17.2 56.967)
2 4_2020_710 14922.439 [m] POINT (19.383 57.667)
17 4_2020_716 13140.927 [m] POINT (19.467 57.75)
13.1 4_2020_NA 11623.025 [m] POINT (19.233 57.333)
8.1 4_2020_719 10162.682 [m] POINT (18.917 57.083)
13 4_2020_NA 7461.431 [m] POINT (19.05 57.233)
6 4_2020_NA 6348.684 [m] POINT (14.567 55.45)
16 4_2020_715 5436.753 [m] POINT (19.5 58.033)
12 4_2020_NA 4556.868 [m] POINT (15.5 55.817)
11 4_2020_737 4257.394 [m] POINT (15.6 55.9)
4 4_2020_NA 3859.228 [m] POINT (13.9 55.25)
10 4_2020_NA 3164.719 [m] POINT (14.417 55.7)
13.2 4_2020_NA 3014.117 [m] POINT (19.1 57.3)
3 4_2020_730 2180.378 [m] POINT (13.583 55.183)
16.1 4_2020_715 2068.020 [m] POINT (19.45 58.083)
8.2 4_2020_719 1996.844 [m] POINT (18.883 57.167)
7 4_2020_734 1818.487 [m] POINT (14.5 55.667)
9 4_2020_735 1069.982 [m] POINT (14.367 55.683)
17.1 4_2020_716 1009.671 [m] POINT (19.517 57.867)
geometry.1
14 POINT (16.9 57.35)
14.2 POINT (16.9 57.35)
14.1 POINT (16.9 57.35)
8 POINT (18.85 57.16667)
1 POINT (17.01667 57.4)
2 POINT (19.16667 57.6)
17 POINT (19.5 57.86667)
13.1 POINT (19.05 57.3)
8.1 POINT (18.85 57.16667)
13 POINT (19.05 57.3)
6 POINT (14.46667 55.45)
16 POINT (19.43333 58.06667)
12 POINT (15.56667 55.83333)
11 POINT (15.56667 55.86667)
4 POINT (13.91667 55.28333)
10 POINT (14.36667 55.7)
13.2 POINT (19.05 57.3)
3 POINT (13.6 55.2)
16.1 POINT (19.43333 58.06667)
8.2 POINT (18.85 57.16667)
7 POINT (14.5 55.68333)
9 POINT (14.35 55.68333)
17.1 POINT (19.5 57.86667)
# Here are the points on a map. Some trawl stations that are far from a CTD reading
st_connect(trawl_sf, ctd_sf) %>%
mapview::mapview() +
mapview::mapview(trawl_sf, color = 'tomato', col.regions = 'tomato') +
mapview::mapview(ctd_sf, color = 'steelblue', col.regions = 'steelblue')Calculating nearest IDs
|
| | 0%
|
|==== | 6%
|
|======== | 12%
|
|============ | 18%
|
|================ | 24%
|
|===================== | 29%
|
|========================= | 35%
|
|============================= | 41%
|
|================================= | 47%
|
|===================================== | 53%
|
|========================================= | 59%
|
|============================================= | 65%
|
|================================================= | 71%
|
|====================================================== | 76%
|
|========================================================== | 82%
|
|============================================================== | 88%
|
|================================================================== | 94%
|
|======================================================================| 100%
Calculating lines
# Here we can see which values of CTD are matched to trawl data
ggplot() +
geom_sf(data = trawl_sf, aes(color = "trawl"), size = 3, alpha = .6) +
geom_sf(data = ctd_sf, aes(color = "ctd"), alpha = .6) +
scale_color_manual(values = c("steelblue", "tomato"), name = "Data type") +
geom_sf(data = st_connect(trawl_sf, ctd_sf), linewidth = 0.2)Calculating nearest IDs
|
| | 0%
|
|==== | 6%
|
|======== | 12%
|
|============ | 18%
|
|================ | 24%
|
|===================== | 29%
|
|========================= | 35%
|
|============================= | 41%
|
|================================= | 47%
|
|===================================== | 53%
|
|========================================= | 59%
|
|============================================= | 65%
|
|================================================= | 71%
|
|====================================================== | 76%
|
|========================================================== | 82%
|
|============================================================== | 88%
|
|================================================================== | 94%
|
|======================================================================| 100%
Calculating lines
Preliminary conclusions:
- Not all trawl data have a CTD measurement, we’d need to discard stomach data, and some CTD measurements are nonsense
- It is extremely time-consuming to prepare the data, essentially it needs to be done by each survey, and sometimes within each file (station) within a survey
- At which depths do we summarise oxygen data? We cannot know for sure it’s the bottom
- How far from the trawl location is a CTD measurement ok to use?
Do it for all years!
Here I will redo everything but with only the essential code. I can do until 18_11 using the approach in the first section.
Note, it’s not 100% the same code, because in some surveys, the row of data starts at a different line, and there are also different amounts of columns…
# --- Pre-2019 CTD surveys ---------------------------------------------------
# Every survey below is a folder with one xlsx file per cast, all sharing the
# same layout; they differ only in how many header rows to skip and (for
# 17_11 and 18_02) in which columns hold lat/lon. The seven near-identical
# copy-pasted chunks are therefore factored into one helper.
#
# dir:     folder containing one xlsx file per CTD cast
# skip:    number of rows before the data block starts in each sheet
# quarter: survey quarter (1 or 4)
# year:    survey year
# cols:    sheet positions of the temp, oxy, depth, lat, lon columns
# Returns a tibble with numeric temp/oxy/depth/lat/lon plus filename,
# q_year and smhi_serial_no -- the same columns the original chunks made.
read_ctd_survey <- function(dir, skip, quarter, year, cols = c(4, 12, 19, 23, 24)) {
  setwd(dir)
  # Anchored pattern: only files actually ending in .xlsx
  filenames <- list.files(pattern = "\\.xlsx$")
  purrr::map_df(filenames,
                ~read_excel(.x, skip = skip, col_types = "guess") %>%
                  dplyr::select(all_of(cols)) %>%
                  dplyr::rename(temp = 1,
                                oxy = 2,
                                depth = 3,
                                lat = 4,
                                lon = 5) %>%
                  mutate(filename = .x) %>%
                  # Some surveys deliver these columns as text; coercion is a
                  # no-op when they are already numeric
                  mutate(temp = as.numeric(temp),
                         oxy = as.numeric(oxy),
                         depth = as.numeric(depth),
                         lat = as.numeric(lat),
                         lon = as.numeric(lon))) %>%
    mutate(filename = str_remove(filename, fixed(".xlsx")),
           q_year = paste(quarter, year, sep = "_"),
           # Serial number = everything after the last dot in the file name
           smhi_serial_no = as.numeric(sub(".*\\.(.*)", "\\1", filename)))
}

# Nov 2015
q4_15 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/15_11/CTD_SBE911/CNV/for_r/"),
                         skip = 385, quarter = 4, year = 2015)
# Feb 2016
q1_16 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/16_02/"),
                         skip = 368, quarter = 1, year = 2016)
# Nov 2016
q4_16 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/16_11/"),
                         skip = 368, quarter = 4, year = 2016)
# Feb 2017
q1_17 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/17_02/"),
                         skip = 385, quarter = 1, year = 2017)
# Nov 2017 (note: lat/lon sit in different columns in this and the next survey)
q4_17 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/17_11/"),
                         skip = 375, quarter = 4, year = 2017,
                         cols = c(4, 12, 19, 22, 23))
# Feb 2018
q1_18 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/18_02/"),
                         skip = 375, quarter = 1, year = 2018,
                         cols = c(4, 12, 19, 22, 23))
# Nov 2018
q4_18 <- read_ctd_survey(paste0(home, "/data/bits-oxygen/18_11/"),
                         skip = 377, quarter = 4, year = 2018)
setwd(home)
# Combine all these data; drop rows without an oxygen reading
oxy_old_ctd <- bind_rows(q4_15,
                         q1_16, q4_16,
                         q1_17, q4_17,
                         q1_18, q4_18) %>%
  drop_na(oxy)
# Quick inspect
ggplot(oxy_old_ctd, aes(oxy)) +
geom_histogram() +
facet_wrap(~ q_year)ggplot(oxy_old_ctd, aes(lat)) +
facet_wrap(~q_year, scales = "free") +
geom_histogram()ggplot(oxy_old_ctd %>% filter(q_year == "4_2018"), aes(lat)) +
facet_wrap(~filename, scales = "free") +
geom_histogram()Some coordinates are off, but they are also in the raw data
Now more on to the more recent data. 2019 Q4 and 2020 Q1 will have to be done solo because they are single excel file with tabs for each station. It does seem to have station number as tab names so I don’t need to match by coordinates.
Here I don’t have a depth column, only pressure and what I guess is bottom depth. I will keep pressure. We need this to calculate the average oxygen near sea-bottom. Before I said 3 metres above deepest, but I might just go with the highest pressure here for this survey.
setwd(home)
d %>%
filter(year == 2019, quarter == 4) %>%
distinct(smhi_serial_no) %>%
arrange(smhi_serial_no) %>%
pull() [1] 64 65 66 67 68 69 70 72 73 74 78 79 80 82 83 86 87 88 89 90 91 92 93
d %>%
filter(year == 2020, quarter == 1) %>%
distinct(smhi_serial_no) %>%
arrange(smhi_serial_no) %>%
pull() [1] 99 100 101 102 103 104 105 106 107 109 111 112 114 117 118 120 121 122 123
[20] 125 126 127 128 129 130 131 132 133 134 135 136 142 143
# OK, yes, tabs are station numbers (verify in spreadsheet). Read tabs as elements in a list, then loop through
# 2019 Q4
xl_data <- paste0(home, "/data/bits-oxygen/19_11/Bits2019 Q4 redigerad.xlsx")
tab_names <- excel_sheets(path = xl_data)
# Select only tabs with CTD data
tab_names_trim <- tab_names[grepl("CTD", tab_names)]
# Put all tabs in a list. Skip different # rows to get either raw data, id or lat/lon
# (for this survey one read per tab suffices: the data block starts at row 35)
list_vars_q419 <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 34))
str(list_vars_q419[[1]])tibble [391 × 19] (S3: tbl_df/tbl/data.frame)
$ Date / Time : chr [1:391] "20/11/2019 06:58:26" "20/11/2019 07:03:46" "20/11/2019 07:03:47" "20/11/2019 07:03:49" ...
$ TURBIDITY;FTU : num [1:391] 13.38 2.38 2.38 2.44 2.38 ...
$ DISSOLVED OXYGEN;SAT% : num [1:391] 85.6 83.3 83 82.6 82.6 ...
$ PRESSURE;DBAR : num [1:391] 0.249 0.403 0.395 0.191 0.203 0.415 0.389 0.405 0.383 0.184 ...
$ TEMPERATURE;C : num [1:391] 9.64 9.67 9.67 9.67 9.66 ...
$ CONDUCTIVITY;MS/CM : num [1:391] 9.57 9.56 9.56 9.55 9.55 ...
$ PH;PH : num [1:391] -3.15 -3.15 -3.15 -3.15 -3.15 ...
$ REDOX;MV : num [1:391] 413 414 415 417 417 ...
$ Calc, SALINITY; PSU : num [1:391] 7.82 7.81 7.81 7.8 7.8 ...
$ Calc, DENSITY ANOMALY; KG/M3 [EOS-80]: num [1:391] 1006 1006 1006 1006 1006 ...
$ Calc, SOS; M/SEC : num [1:391] 1456 1456 1456 1456 1456 ...
$ ml/L : num [1:391] 6.48 6.3 6.28 6.25 6.25 ...
$ ...13 : logi [1:391] NA NA NA NA NA NA ...
$ 1463.1255303030291 : logi [1:391] NA NA NA NA NA NA ...
$ 10.745775757575752 : logi [1:391] NA NA NA NA NA NA ...
$ 12.625266666666668 : logi [1:391] NA NA NA NA NA NA ...
$ 10.18184242424242 : logi [1:391] NA NA NA NA NA NA ...
$ 38.911872727272758 : logi [1:391] NA NA NA NA NA NA ...
$ 5.0589626909370216 : logi [1:391] NA NA NA NA NA NA ...
# list_all contains all relevant tabs. Loop through all these
# Tidy each Q4-2019 tab: keep temperature, oxygen and pressure, tag the
# survey, and derive the serial number from the tab name ("CTD64" -> 64)
vars_list_q419 <- lapply(seq_along(tab_names_trim), function(i) {
  list_vars_q419[[i]] %>%
    dplyr::rename(temp = `TEMPERATURE;C`,
                  oxy = `ml/L`,
                  pressue_bar = `PRESSURE;DBAR`) %>%
    dplyr::select(temp, oxy, pressue_bar) %>%
    mutate(q_year = paste(4, 2019, sep = "_"),
           smhi_serial_no = as.numeric(str_remove(tab_names_trim[i], "CTD")))
})
q4_19 <- bind_rows(vars_list_q419)
# 2020 Q1
xl_data <- paste0(home, "/data/bits-oxygen/20_02/Bits 2020 Q1.xlsx")
tab_names <- excel_sheets(path = xl_data)
# Select only tabs with CTD data
tab_names_trim <- tab_names[grepl("CTD", tab_names)]
# Put all tabs in a list. Skip different # rows to get either raw data, id or lat/lon
# (same layout as Q4 2019: the data block starts at row 35)
list_vars_q120 <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 34))
str(list_vars_q120[[1]])tibble [215 × 19] (S3: tbl_df/tbl/data.frame)
$ Date / Time : chr [1:215] "26/02/2020 05:48:58" "26/02/2020 05:49:00" "26/02/2020 05:49:01" "26/02/2020 05:49:02" ...
$ TURBIDITY;FTU : num [1:215] 2.5 2.5 2.38 2.44 2.38 ...
$ DISSOLVED OXYGEN;SAT% : num [1:215] 72.6 72.8 72.8 72.9 72.9 ...
$ PRESSURE;DBAR : num [1:215] 0.255 0.426 0.614 0.821 1.016 ...
$ TEMPERATURE;C : num [1:215] 5.45 5.46 5.46 5.45 5.46 ...
$ CONDUCTIVITY;MS/CM : num [1:215] 9.25 9.25 9.26 9.26 9.26 ...
$ PH;PH : num [1:215] -3.15 -3.15 -3.15 -3.15 -3.15 ...
$ REDOX;MV : num [1:215] 326 343 348 355 360 ...
$ Calc, SALINITY; PSU : num [1:215] 8.51 8.51 8.51 8.51 8.51 ...
$ Calc, DENSITY ANOMALY; KG/M3 [EOS-80]: num [1:215] 1007 1007 1007 1007 1007 ...
$ Calc, SOS; M/SEC : num [1:215] 1439 1439 1439 1439 1439 ...
$ ml/L : num [1:215] 6.06 6.07 6.08 6.08 6.09 ...
$ ...13 : logi [1:215] NA NA NA NA NA NA ...
$ 1443.4471935483862 : logi [1:215] NA NA NA NA NA NA ...
$ 6.1277580645161311 : logi [1:215] NA NA NA NA NA NA ...
$ 10.384677419354841 : logi [1:215] NA NA NA NA NA NA ...
$ 9.4149139784946225 : logi [1:215] NA NA NA NA NA NA ...
$ 24.387274193548393 : logi [1:215] NA NA NA NA NA NA ...
$ 5.4491714947240055 : logi [1:215] NA NA NA NA NA NA ...
# list_all contains all relevant tabs. Loop through all these
# Tidy each Q1-2020 tab: keep temperature, oxygen and pressure, tag the
# survey, and derive the serial number from the tab name ("CTD99" -> 99)
vars_list_q120 <- lapply(seq_along(tab_names_trim), function(i) {
  list_vars_q120[[i]] %>%
    dplyr::rename(temp = `TEMPERATURE;C`,
                  oxy = `ml/L`,
                  pressue_bar = `PRESSURE;DBAR`) %>%
    dplyr::select(temp, oxy, pressue_bar) %>%
    mutate(q_year = paste(1, 2020, sep = "_"),
           smhi_serial_no = as.numeric(str_remove(tab_names_trim[i], "CTD")))
})
q1_20 <- bind_rows(vars_list_q120)
unique(q1_20$smhi_serial_no) [1] 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
[20] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
[39] 137 138 139 140 141 143
Ok, now we are entering the times when we have to join by coordinates. It’s also the type of files where this is not stored as a column… First I need to do Q4 2020 again, because unlike the rest of the data that comes after, they are in a single file with tabs, the others have 1 file per station
setwd(home)
# Q4 2020 again (final pass): single workbook, one tab per cast
xl_data <- paste0(home, "/data/bits-oxygen/20_11/CTD_BITS-06.xlsx")
tab_names <- excel_sheets(path = xl_data)
# Keep only the tabs holding CTD profiles
tab_names_trim <- tab_names[grepl("\\_SBE", tab_names)]
# Put all tabs in a list. Skip different # rows to get either raw data, id or lat/lon
# NOTE(review): each sheet is read four times (different skips) so the
# station-id / latitude / longitude header rows surface as column names
list_vars <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 12))
list_all_id <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x))
list_all_lat <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 8))
list_all_lon <- lapply(tab_names_trim, function(x) read_excel(path = xl_data, sheet = x, skip = 9))
# list_all contains all relevant tabs. Loop through all these
# Same parsing loop as in the exploratory section, but with the short column
# names (temp/depth/oxy) used by the combined data set
vars_list <- list()
for(i in 1:length(tab_names_trim)){
# Get the actual CTD data
vars <- list_vars[[i]] %>%
dplyr::select(contains(c("deg C", "[true depth, m]", "[ml/l]"))) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3)
# Now get the ID
# Serial number = last space-separated token of the station header row;
# NA when the tab only carries a station name
id <- names(list_all_id[[i]])[1]
id <- strsplit(id, " ")[[1]]
id <- as.numeric(id[length(id)])
vars$smhi_serial_no <- id
# Lat and lon for cross-checking
vars$lat <- names(list_all_lat[[i]])[1]
vars$lon <- names(list_all_lon[[i]])[1]
# Clean up data!
#lat.d.d = format.position(lat.dm.m)
vars <- vars %>%
mutate(temp = as.numeric(temp),
depth = as.numeric(depth),
oxy = as.numeric(oxy),
smhi_serial_no = as.numeric(smhi_serial_no)) %>%
# Strip the header text, then convert degrees + decimal minutes to
# decimal degrees via format.position()
mutate(lat = str_remove(lat, "\\** Latitude: ")) %>%
mutate(lat = str_remove(lat, " N")) %>%
mutate(lat = str_remove(lat, " ")) %>%
mutate(lat = format.position(as.numeric(lat))) %>%
mutate(lat = as.numeric(lat)) %>%
mutate(lon = str_remove(lon, "\\** Longitude: ")) %>%
mutate(lon = str_remove(lon, " E")) %>%
mutate(lon = str_remove(lon, " ")) %>%
mutate(lon = format.position(as.numeric(lon))) %>%
mutate(lon = as.numeric(lon))
vars_list[[i]] <- vars
}
q4_20 <- bind_rows(vars_list) %>% mutate(q_year = paste(4, 2020, sep = "_"))Now do the rest of the recent data. This is q1 2021, q4 2021, q1 2022 and q4 2022
For quarter 1, 2021, the actual data doesn’t start at the same row number… need to split them in two folders. Therefore I do this one separately
# Q1 2021
# Batch a: one xlsx file per station; measurements start after 207 header rows.
# Columns 3 / 10 / 12 hold temperature / depth / oxygen (assumed from the
# rename below — TODO confirm against a raw file).
setwd(paste0(home, "/data/bits-oxygen/21_02/208/"))
# NOTE(review): pattern= is a regex, not a glob; "\\.xlsx$" would be stricter
filenames <- list.files(pattern = "*.xlsx")
q1_21a <- purrr::map_df(filenames,
~read_excel(.x, skip = 207, col_types = "guess") %>%
dplyr::select(3, 10, 12) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3) %>%
mutate(filename = .x)) %>%
# Strip the extension so filename can act as a join key with the coord tables
mutate(filename = str_remove(filename, ".xlsx"),
q_year = paste(1, 2021, sep = "_"))
q1_21a# A tibble: 5,714 × 5
temp depth oxy filename q_year
<chr> <chr> <chr> <chr> <chr>
1 2.8468 2.025 9.2294 SBE19_6164_20210222_0828_77_10_0204 1_2021
2 2.8456 2.531 9.2259 SBE19_6164_20210222_0828_77_10_0204 1_2021
3 2.8471 3.038 9.2283 SBE19_6164_20210222_0828_77_10_0204 1_2021
4 2.8480 3.544 9.2258 SBE19_6164_20210222_0828_77_10_0204 1_2021
5 2.8482 4.050 9.2141 SBE19_6164_20210222_0828_77_10_0204 1_2021
6 2.8471 4.556 9.2322 SBE19_6164_20210222_0828_77_10_0204 1_2021
7 2.8474 5.063 9.2360 SBE19_6164_20210222_0828_77_10_0204 1_2021
8 2.8494 5.569 9.2330 SBE19_6164_20210222_0828_77_10_0204 1_2021
9 2.8511 6.075 9.2318 SBE19_6164_20210222_0828_77_10_0204 1_2021
10 2.8555 6.581 9.2266 SBE19_6164_20210222_0828_77_10_0204 1_2021
# ℹ 5,704 more rows
# Now I need to find the coordinates, and match them by the filename.
# The header block (skip = 20) has rows labelled "Latitude:"/"Longitude:" in the
# `Cruise:BITS` column, with the DDMM.M value spilled over columns ...3 and ...4.
q1_21a_lat <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITS`, "...3", "...4", "filename")) %>%
filter(`Cruise:BITS` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(...3, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
# Keep latitude rows only
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# Convert DDMM.M to decimal degrees
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q1_21a_lat# A tibble: 41 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5537.46 SBE19_6164_20210222_0828_77_10_0204 55.6
2 5541.45 SBE19_6164_20210222_1028_77_10_0205 55.7
3 5540.96 SBE19_6164_20210222_1609_77_10_0207 55.7
4 5541.48 SBE19_6164_20210223_0536_77_10_0208 55.7
5 5548.500 SBE19_6164_20210223_1354_77_10_0209 55.8
6 5549.38 SBE19_6164_20210223_1559_77_10_0210 55.8
7 5555.77 SBE19_6164_20210224_0500_77_10_0211 55.9
8 5607.94 SBE19_6164_20210224_0720_77_10_0212 56.1
9 5612.67 SBE19_6164_20210224_0922_77_10_0213 56.2
10 5618.93 SBE19_6164_20210224_1141_77_10_0214 56.3
# ℹ 31 more rows
# Same extraction as q1_21a_lat, but keeping the longitude rows
q1_21a_lon <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITS`, "...3", "...4", "filename")) %>%
filter(`Cruise:BITS` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(...3, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
# Convert DDMM.M to decimal degrees
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q1_21a_lon# A tibble: 41 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1440.76 SBE19_6164_20210222_0828_77_10_0204 14.7
2 14453.23 SBE19_6164_20210222_1028_77_10_0205 14.8
3 1423.67 SBE19_6164_20210222_1609_77_10_0207 14.4
4 1422.64 SBE19_6164_20210223_0536_77_10_0208 14.4
5 1553.94 SBE19_6164_20210223_1354_77_10_0209 15.9
6 1558.77 SBE19_6164_20210223_1559_77_10_0210 16.0
7 1837.55 SBE19_6164_20210224_0500_77_10_0211 18.6
8 1820.57 SBE19_6164_20210224_0720_77_10_0212 18.3
9 1827.74 SBE19_6164_20210224_0922_77_10_0213 18.4
10 1825.56 SBE19_6164_20210224_1141_77_10_0214 18.4
# ℹ 31 more rows
# One row per station: filename plus decimal-degree lat/lon
coords_a <- left_join(q1_21a_lat, q1_21a_lon, by = "filename") %>%
  dplyr::select(filename, lat, lon)
coords_a# A tibble: 41 × 3
filename lat lon
<chr> <dbl> <dbl>
1 SBE19_6164_20210222_0828_77_10_0204 55.6 14.7
2 SBE19_6164_20210222_1028_77_10_0205 55.7 14.8
3 SBE19_6164_20210222_1609_77_10_0207 55.7 14.4
4 SBE19_6164_20210223_0536_77_10_0208 55.7 14.4
5 SBE19_6164_20210223_1354_77_10_0209 55.8 15.9
6 SBE19_6164_20210223_1559_77_10_0210 55.8 16.0
7 SBE19_6164_20210224_0500_77_10_0211 55.9 18.6
8 SBE19_6164_20210224_0720_77_10_0212 56.1 18.3
9 SBE19_6164_20210224_0922_77_10_0213 56.2 18.4
10 SBE19_6164_20210224_1141_77_10_0214 56.3 18.4
# ℹ 31 more rows
# Attach station coordinates to every CTD row via the filename key
q1_21a <- q1_21a %>% left_join(coords_a, by = "filename")
q1_21a# A tibble: 5,714 × 7
temp depth oxy filename q_year lat lon
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 2.8468 2.025 9.2294 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
2 2.8456 2.531 9.2259 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
3 2.8471 3.038 9.2283 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
4 2.8480 3.544 9.2258 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
5 2.8482 4.050 9.2141 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
6 2.8471 4.556 9.2322 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
7 2.8474 5.063 9.2360 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
8 2.8494 5.569 9.2330 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
9 2.8511 6.075 9.2318 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
10 2.8555 6.581 9.2266 SBE19_6164_20210222_0828_77_10_0204 1_2021 55.6 14.7
# ℹ 5,704 more rows
# Now do the other type of file
# Batch b: same layout as batch a but the measurements start after 387
# header rows instead of 207
setwd(paste0(home, "/data/bits-oxygen/21_02/387/"))
filenames <- list.files(pattern = "*.xlsx")
q1_21b <- purrr::map_df(filenames,
~read_excel(.x, skip = 387, col_types = "guess") %>%
dplyr::select(3, 10, 12) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3) %>%
mutate(filename = .x)) %>%
mutate(filename = str_remove(filename, ".xlsx"),
q_year = paste(1, 2021, sep = "_"))
q1_21b# A tibble: 800 × 5
temp depth oxy filename q_year
<chr> <chr> <chr> <chr> <chr>
1 3.1052 5.061 9.0929 SBE19_6164_20210220_0800_77_10_0194 1_2021
2 3.1009 5.568 9.0868 SBE19_6164_20210220_0800_77_10_0194 1_2021
3 3.0982 6.074 9.0867 SBE19_6164_20210220_0800_77_10_0194 1_2021
4 3.0975 6.580 9.0763 SBE19_6164_20210220_0800_77_10_0194 1_2021
5 3.0973 7.086 9.0834 SBE19_6164_20210220_0800_77_10_0194 1_2021
6 3.0971 7.592 9.0807 SBE19_6164_20210220_0800_77_10_0194 1_2021
7 3.0969 8.098 9.0840 SBE19_6164_20210220_0800_77_10_0194 1_2021
8 3.0965 8.604 9.0761 SBE19_6164_20210220_0800_77_10_0194 1_2021
9 3.0963 9.111 9.0833 SBE19_6164_20210220_0800_77_10_0194 1_2021
10 3.0963 9.617 9.0859 SBE19_6164_20210220_0800_77_10_0194 1_2021
# ℹ 790 more rows
# Now I need to find the coordinates, and match them by the filename.
# Header labels sit in `Cruise:BITS`, coordinate value spilled over ...3/...4
# (skip = 197 here, unlike skip = 20 in batch a)
q1_21b_lat <- purrr::map_df(filenames,
~read_excel(.x, skip = 197, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITS`, "...3", "...4", "filename")) %>%
filter(`Cruise:BITS` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(...3, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# DDMM.M -> decimal degrees
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q1_21b_lat# A tibble: 9 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5551.91 SBE19_6164_20210220_0800_77_10_0194 55.8
2 5547.27 SBE19_6164_20210220_1013_77_10_0195 55.8
3 5550.14 SBE19_6164_20210220_1409_77_10_0196 55.8
4 5546.30 SBE19_6164_20210220_1632_77_10_0197 55.8
5 5510.37 SBE19_6164_20210221_0530_77_10_0198 55.2
6 5512.71 SBE19_6164_20210221_0953_77_10_0199 55.2
7 5515.32 SBE19_6164_20210221_1209_77_10_0200 55.2
8 5513.78 SBE19_6164_20210221_1638_77_10_0202 55.2
9 5527.71 SBE19_6164_20210222_0531_77_10_0203 55.4
# Same extraction, keeping longitude rows
q1_21b_lon <- purrr::map_df(filenames,
~read_excel(.x, skip = 197, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITS`, "...3", "...4", "filename")) %>%
filter(`Cruise:BITS` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(...3, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q1_21b_lon# A tibble: 9 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1603.30 SBE19_6164_20210220_0800_77_10_0194 16.0
2 1558.36 SBE19_6164_20210220_1013_77_10_0195 16.0
3 1531.23 SBE19_6164_20210220_1409_77_10_0196 15.5
4 1523.66 SBE19_6164_20210220_1632_77_10_0197 15.4
5 1320.23 SBE19_6164_20210221_0530_77_10_0198 13.3
6 1335.50 SBE19_6164_20210221_0953_77_10_0199 13.6
7 1340.10 SBE19_6164_20210221_1209_77_10_0200 13.7
8 1355.96 SBE19_6164_20210221_1638_77_10_0202 13.9
9 1433.13 SBE19_6164_20210222_0531_77_10_0203 14.6
# One row per station: filename plus decimal-degree lat/lon
coords_b <- q1_21b_lat %>% left_join(q1_21b_lon, by = "filename") %>% dplyr::select(filename, lat, lon)
coords_b# A tibble: 9 × 3
filename lat lon
<chr> <dbl> <dbl>
1 SBE19_6164_20210220_0800_77_10_0194 55.8 16.0
2 SBE19_6164_20210220_1013_77_10_0195 55.8 16.0
3 SBE19_6164_20210220_1409_77_10_0196 55.8 15.5
4 SBE19_6164_20210220_1632_77_10_0197 55.8 15.4
5 SBE19_6164_20210221_0530_77_10_0198 55.2 13.3
6 SBE19_6164_20210221_0953_77_10_0199 55.2 13.6
7 SBE19_6164_20210221_1209_77_10_0200 55.2 13.7
8 SBE19_6164_20210221_1638_77_10_0202 55.2 13.9
9 SBE19_6164_20210222_0531_77_10_0203 55.4 14.6
# Attach station coordinates to every CTD row via the filename key
q1_21b <- q1_21b %>% left_join(coords_b, by = "filename")
q1_21b# A tibble: 800 × 7
temp depth oxy filename q_year lat lon
<chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 3.1052 5.061 9.0929 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
2 3.1009 5.568 9.0868 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
3 3.0982 6.074 9.0867 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
4 3.0975 6.580 9.0763 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
5 3.0973 7.086 9.0834 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
6 3.0971 7.592 9.0807 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
7 3.0969 8.098 9.0840 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
8 3.0965 8.604 9.0761 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
9 3.0963 9.111 9.0833 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
10 3.0963 9.617 9.0859 SBE19_6164_20210220_0800_77_10_0194 1_2021 55.8 16.0
# ℹ 790 more rows
# Combine!
q1_21 <- bind_rows(q1_21a, q1_21b) %>%
mutate(temp = as.numeric(temp),
depth = as.numeric(depth),
oxy = as.numeric(oxy),
lat = as.numeric(as.character(lat)),
lon = as.numeric(as.character(lon)))Continuing with q4 2021… Here coordinates are reported differently (as in different positions in the string…). And latitude is even misspelled!?!?
# Q4 2021
# One xlsx per station; measurements start after 208 header rows.
# Numeric coercion is done per-file here (inside map_df), unlike q1_21.
setwd(paste0(home, "/data/bits-oxygen/21_11/"))
filenames <- list.files(pattern = "*.xlsx")
q4_21 <- purrr::map_df(filenames,
~read_excel(.x, skip = 208, col_types = "guess") %>%
dplyr::select(3, 10, 12) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3) %>%
mutate(filename = .x,
temp = as.numeric(temp),
depth = as.numeric(depth),
oxy = as.numeric(oxy))) %>%
mutate(filename = str_remove(filename, ".xlsx"),
q_year = paste(4, 2021, sep = "_"))
# Now I need to find the coordinates, and match them by the filename.
# In this survey the source files misspell the label as "Lattitude", the header
# column is `Cruise:BITSQ42021`, and the value is spilled over ...6/...7.
q4_21_lat <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITSQ42021`, "...6", "...7", "filename")) %>%
filter(`Cruise:BITSQ42021` %in% c("Lattitude", "Longitude")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Lattitude", "lat", "lon"),
coords = paste(...6, ...7, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# DDMM.M -> decimal degrees
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q4_21_lat# A tibble: 37 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5510.760999999999999 SBE19_6164_20211121_0617_77_10_0966 55.2
2 5518.712 SBE19_6164_20211121_1113_77_10_0967 55.3
3 5516.155999999999999 SBE19_6164_20211121_1314_77_10_0968 55.3
4 5527.439 SBE19_6164_20211122_0555_77_10_0969 55.4
5 5536.173000000000002 SBE19_6164_20211122_0859_77_10_0970 55.6
6 5540.664000000000001 SBE19_6164_20211122_1122_77_10_0971 55.7
7 5542.558 SBE19_6164_20211122_1411_77_10_0972 55.7
8 5541.6 SBE19_6164_20211123_0553_77_10_0973 55.7
9 5539.825000000000003 SBE19_6164_20211123_0922_77_10_0974 55.6
10 5538 SBE19_6164_20211123_1125_77_10_0975 55.6
# ℹ 27 more rows
# Same extraction, keeping longitude rows ("Lattitude" misspelling is in the files)
q4_21_lon <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:BITSQ42021`, "...6", "...7", "filename")) %>%
filter(`Cruise:BITSQ42021` %in% c("Lattitude", "Longitude")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Lattitude", "lat", "lon"),
coords = paste(...6, ...7, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q4_21_lon# A tibble: 37 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1320.062999999999999 SBE19_6164_20211121_0617_77_10_0966 13.3
2 1351.265999999999998 SBE19_6164_20211121_1113_77_10_0967 13.8
3 1357.898000000000003 SBE19_6164_20211121_1314_77_10_0968 14.0
4 1437.692 SBE19_6164_20211122_0555_77_10_0969 14.6
5 1437.158000000000001 SBE19_6164_20211122_0859_77_10_0970 14.6
6 1423.521999999999998 SBE19_6164_20211122_1122_77_10_0971 14.4
7 1425.649000000000001 SBE19_6164_20211122_1411_77_10_0972 14.4
8 1422.651 SBE19_6164_20211123_0553_77_10_0973 14.4
9 1430.541 SBE19_6164_20211123_0922_77_10_0974 14.5
10 1440 SBE19_6164_20211123_1125_77_10_0975 14.7
# ℹ 27 more rows
# One row per station: filename plus decimal-degree lat/lon
coords <- q4_21_lat %>% left_join(q4_21_lon, by = "filename") %>% dplyr::select(filename, lat, lon)
coords# A tibble: 37 × 3
filename lat lon
<chr> <dbl> <dbl>
1 SBE19_6164_20211121_0617_77_10_0966 55.2 13.3
2 SBE19_6164_20211121_1113_77_10_0967 55.3 13.8
3 SBE19_6164_20211121_1314_77_10_0968 55.3 14.0
4 SBE19_6164_20211122_0555_77_10_0969 55.4 14.6
5 SBE19_6164_20211122_0859_77_10_0970 55.6 14.6
6 SBE19_6164_20211122_1122_77_10_0971 55.7 14.4
7 SBE19_6164_20211122_1411_77_10_0972 55.7 14.4
8 SBE19_6164_20211123_0553_77_10_0973 55.7 14.4
9 SBE19_6164_20211123_0922_77_10_0974 55.6 14.5
10 SBE19_6164_20211123_1125_77_10_0975 55.6 14.7
# ℹ 27 more rows
q4_21 <- q4_21 %>% left_join(coords, by = "filename")Continuing with q1 2022…
# Q1 2022
# Same layout as Q4 2021: one xlsx per station, data after 208 header rows
setwd(paste0(home, "/data/bits-oxygen/22_02/"))
filenames <- list.files(pattern = "*.xlsx")
q1_22 <- purrr::map_df(filenames,
~read_excel(.x, skip = 208, col_types = "guess") %>%
dplyr::select(3, 10, 12) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3) %>%
mutate(filename = .x,
temp = as.numeric(temp),
depth = as.numeric(depth),
oxy = as.numeric(oxy))) %>%
mutate(filename = str_remove(filename, ".xlsx"),
q_year = paste(1, 2022, sep = "_"))
q1_22# A tibble: 5,578 × 5
temp depth oxy filename q_year
<dbl> <dbl> <dbl> <chr> <chr>
1 4.07 1.01 8.37 SBE19_6164_20220222_0610_77_10_0170 1_2022
2 4.07 1.52 8.36 SBE19_6164_20220222_0610_77_10_0170 1_2022
3 4.07 2.02 8.35 SBE19_6164_20220222_0610_77_10_0170 1_2022
4 4.08 2.53 8.36 SBE19_6164_20220222_0610_77_10_0170 1_2022
5 4.08 3.04 8.35 SBE19_6164_20220222_0610_77_10_0170 1_2022
6 4.08 3.54 8.34 SBE19_6164_20220222_0610_77_10_0170 1_2022
7 4.07 4.05 8.34 SBE19_6164_20220222_0610_77_10_0170 1_2022
8 4.07 4.56 8.35 SBE19_6164_20220222_0610_77_10_0170 1_2022
9 4.06 5.06 8.35 SBE19_6164_20220222_0610_77_10_0170 1_2022
10 4.06 5.57 8.35 SBE19_6164_20220222_0610_77_10_0170 1_2022
# ℹ 5,568 more rows
# Now I need to find the coordinates, and match them by the filename.
# Header column is plain `Cruise:` here; the value is again spilled over ...6/...7
# and the files again misspell "Lattitude".
q1_22_lat <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "...6", "...7", "filename")) %>%
filter(`Cruise:` %in% c("Lattitude", "Longitude")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Lattitude", "lat", "lon"),
coords = paste(...6, ...7, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# DDMM.M -> decimal degrees
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q1_22_lat# A tibble: 47 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5541.566 SBE19_6164_20220222_0610_77_10_0170 55.7
2 5537.281 SBE19_6164_20220222_1013_77_10_0171 55.6
3 5549.913 SBE19_6164_20220222_1351_77_10_0172 55.8
4 5550.740 SBE19_6164_20220222_1640_77_10_0173 55.8
5 5541.217 SBE19_6164_20220223_0527_77_10_0174 55.7
6 5540.675 SBE19_6164_20220223_0751_77_10_0175 55.7
7 5540 SBE19_6164_20220223_1110_77_10_0176 55.7
8 5548.412 SBE19_6164_20220223_1634_77_10_0177 55.8
9 5510.832 SBE19_6164_20220224_0525_77_10_0178 55.2
10 5514.305 SBE19_6164_20220224_0906_77_10_0179 55.2
# ℹ 37 more rows
# Same extraction, keeping longitude rows
q1_22_lon <- purrr::map_df(filenames,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "...6", "...7", "filename")) %>%
filter(`Cruise:` %in% c("Lattitude", "Longitude")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Lattitude", "lat", "lon"),
coords = paste(...6, ...7, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q1_22_lon# A tibble: 47 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1424.237 SBE19_6164_20220222_0610_77_10_0170 14.4
2 1445.683 SBE19_6164_20220222_1013_77_10_0171 14.8
3 1517.850 SBE19_6164_20220222_1351_77_10_0172 15.3
4 1534.110 SBE19_6164_20220222_1640_77_10_0173 15.6
5 1423.249 SBE19_6164_20220223_0527_77_10_0174 14.4
6 1425.381 SBE19_6164_20220223_0751_77_10_0175 14.4
7 1446.195 SBE19_6164_20220223_1110_77_10_0176 14.8
8 1558.088 SBE19_6164_20220223_1634_77_10_0177 16.0
9 1319.256 SBE19_6164_20220224_0525_77_10_0178 13.3
10 1339.589 SBE19_6164_20220224_0906_77_10_0179 13.6
# ℹ 37 more rows
# One row per station: filename plus decimal-degree lat/lon
coords <- q1_22_lat %>% left_join(q1_22_lon, by = "filename") %>% dplyr::select(filename, lat, lon)
coords# A tibble: 47 × 3
filename lat lon
<chr> <dbl> <dbl>
1 SBE19_6164_20220222_0610_77_10_0170 55.7 14.4
2 SBE19_6164_20220222_1013_77_10_0171 55.6 14.8
3 SBE19_6164_20220222_1351_77_10_0172 55.8 15.3
4 SBE19_6164_20220222_1640_77_10_0173 55.8 15.6
5 SBE19_6164_20220223_0527_77_10_0174 55.7 14.4
6 SBE19_6164_20220223_0751_77_10_0175 55.7 14.4
7 SBE19_6164_20220223_1110_77_10_0176 55.7 14.8
8 SBE19_6164_20220223_1634_77_10_0177 55.8 16.0
9 SBE19_6164_20220224_0525_77_10_0178 55.2 13.3
10 SBE19_6164_20220224_0906_77_10_0179 55.2 13.6
# ℹ 37 more rows
q1_22 <- q1_22 %>% left_join(coords, by = "filename")Final dataset! q4 2022… And of course, here the CTD can spell latitude again. And, the coordinates again end up in a different string… And best of all, half the files have coordinates spanning over 3 columns, the other half over two…
# Q4 2022
# Same measurement layout as Q4 2021 / Q1 2022 (data after 208 header rows)
setwd(paste0(home, "/data/bits-oxygen/22_11/"))
filenames <- list.files(pattern = "*.xlsx")
q4_22 <- purrr::map_df(filenames,
~read_excel(.x, skip = 208, col_types = "guess") %>%
dplyr::select(3, 10, 12) %>%
dplyr::rename(temp = 1,
depth = 2,
oxy = 3) %>%
mutate(filename = .x,
temp = as.numeric(temp),
depth = as.numeric(depth),
oxy = as.numeric(oxy))) %>%
mutate(filename = str_remove(filename, ".xlsx"),
q_year = paste(4, 2022, sep = "_"))
q4_22# A tibble: 4,316 × 5
temp depth oxy filename q_year
<dbl> <dbl> <dbl> <chr> <chr>
1 8.43 2.03 7.36 SBE19_6164_20221118_0821_77_10_0979 4_2022
2 8.43 2.53 7.37 SBE19_6164_20221118_0821_77_10_0979 4_2022
3 8.43 3.04 7.36 SBE19_6164_20221118_0821_77_10_0979 4_2022
4 8.43 3.55 7.36 SBE19_6164_20221118_0821_77_10_0979 4_2022
5 8.43 4.05 7.35 SBE19_6164_20221118_0821_77_10_0979 4_2022
6 8.43 4.56 7.35 SBE19_6164_20221118_0821_77_10_0979 4_2022
7 8.43 5.07 7.35 SBE19_6164_20221118_0821_77_10_0979 4_2022
8 8.43 5.57 7.36 SBE19_6164_20221118_0821_77_10_0979 4_2022
9 8.43 6.08 7.36 SBE19_6164_20221118_0821_77_10_0979 4_2022
10 8.43 6.58 7.35 SBE19_6164_20221118_0821_77_10_0979 4_2022
# ℹ 4,306 more rows
# Now I need to find the coordinates, and match them by the filename.
# The latter half have colnames spanning 3 columns, the first half only two...
# NOTE(review): this hard-coded split index is brittle if files are added/removed
filenames_a <- filenames[1:18] # Look at this indexing...
q4_22_lat_a <- purrr::map_df(filenames_a,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "BITS", "...4", "filename")) %>% # note the difference in where the coordinates end up
filter(`Cruise:` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(BITS, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# DDMM.M -> decimal degrees
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q4_22_lat_a# A tibble: 18 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5723.161999999999999 SBE19_6164_20221118_0821_77_10_0979 57.4
2 5726.398 SBE19_6164_20221118_1027_77_10_0980 57.4
3 5717.711 SBE19_6164_20221118_1512_77_10_0982 57.3
4 5539.884 SBE19_6164_20221119_0536_77_10_0983 55.6
5 5539.884 SBE19_6164_20221119_0550_77_10_LERR 55.6
6 5547.846 SBE19_6164_20221119_1006_77_10_0984 55.8
7 5550.565 SBE19_6164_20221119_1209_77_10_0985 55.8
8 5550.858 SBE19_6164_20221119_1401_77_10_0986 55.8
9 5511.166 SBE19_6164_20221120_0542_77_10_0987 55.2
10 5516.307 SBE19_6164_20221120_1047_77_10_0988 55.3
11 5515.930 SBE19_6164_20221120_1303_77_10_0989 55.2
12 5553.184 SBE19_6164_20221121_0538_77_10_0991 55.9
13 5549.433 SBE19_6164_20221121_0931_77_10_0992 55.8
14 5547.467 SBE19_6164_20221121_1150_77_10_0993 55.8
15 5545.869 SBE19_6164_20221121_1400_77_10_0994 55.8
16 5703.794 SBE19_6164_20221122_0801_77_10_0995 57.0
17 5705.684 SBE19_6164_20221122_0850_77_10_0996 57.1
18 5710.590 SBE19_6164_20221122_1139_77_10_0997 57.2
# Second half of the files: coordinate value spans three columns (BITS/...4/...5)
filenames_b <- filenames[19:length(filenames)]
q4_22_lat_b <- purrr::map_df(filenames_b,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "BITS", "...4", "...5", "filename")) %>% # note the difference in where the coordinates end up
filter(`Cruise:` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(BITS, paste(...4, ...5, sep = "."), sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lat") %>%
dplyr::rename(lat_minute = coords) %>%
dplyr::select(-E_N) %>%
# Drop the trailing ".N" compass suffix produced by the paste above
mutate(lat_minute = str_remove(lat_minute, "\\.N")) %>% # Note this
mutate(lat = format.position(as.numeric(lat_minute)),
filename = str_remove(filename, ".xlsx"))
q4_22_lat_b# A tibble: 17 × 3
lat_minute filename lat
<chr> <chr> <dbl>
1 5714.149 SBE19_6164_20221122_1241_77_10_0998 57.2
2 5721.94 SBE19_6164_20221122_1506_77_10_0999 57.4
3 5720.885 SBE19_6164_20221122_1546_77_10_1000 57.3
4 5750.759 SBE19_6164_20221123_0554_77_10_1001 57.8
5 5754.319 SBE19_6164_20221123_0934_77_10_1002 57.9
6 582.551 SBE19_6164_20221123_1051_77_10_1003 58.0
7 5737.706 SBE19_6164_20221124_0506_77_10_1004 57.6
8 575.733 SBE19_6164_20221124_1408_77_10_1006 57.1
9 572.523 SBE19_6164_20221124_1642_77_10_1006 57.0
10 5658.380 SBE19_6164_20221125_0550_77_10_1008 57.0
11 5716.62 SBE19_6164_20221125_1112_77_10_1009 57.3
12 5654.765 SBE19_6164_20221126_0556_77_10_1010 56.9
13 5641.639 SBE19_6164_20221126_1033_77_10_1012 56.7
14 5630.65 SBE19_6164_20221126_1315_77_10_1013 56.5
15 5527.672 SBE19_6164_20221127_0505_77_10_1014 55.4
16 5541.535 SBE19_6164_20221127_1041_77_10_1015 55.7
17 5541.143 SBE19_6164_20221127_1409_77_10_1017 55.7
# Here's how I found how to split
# : it doesn't split all excel-files in the same way, so I can't extract the coordinates properly...
# t <- q4_22_lat %>% filter(E_N == "lat")
# unique(t$BITS)
# unique(t$...4)
# t %>% filter(...4 == 21)
# t %>% filter(...4 == 26.398)
# Longitude for the first half of the files (two spill columns)
q4_22_lon_a <- purrr::map_df(filenames_a,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "BITS", "...4", "filename")) %>% # note the difference in where the coordinates end up
filter(`Cruise:` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(BITS, ...4, sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q4_22_lon_a# A tibble: 18 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1654.673 SBE19_6164_20221118_0821_77_10_0979 16.9
2 1658.574 SBE19_6164_20221118_1027_77_10_0980 17.0
3 1725.705 SBE19_6164_20221118_1512_77_10_0982 17.4
4 1612.837 SBE19_6164_20221119_0536_77_10_0983 16.2
5 1612.837 SBE19_6164_20221119_0550_77_10_LERR 16.2
6 1558.516 SBE19_6164_20221119_1006_77_10_0984 16.0
7 1600.002 SBE19_6164_20221119_1209_77_10_0985 16
8 1604.391 SBE19_6164_20221119_1401_77_10_0986 16.1
9 1335.007 SBE19_6164_20221120_0542_77_10_0987 13.6
10 1344.758 SBE19_6164_20221120_1047_77_10_0988 13.7
11 1354.338 SBE19_6164_20221120_1303_77_10_0989 13.9
12 1555.570 SBE19_6164_20221121_0538_77_10_0991 15.9
13 1529.059 SBE19_6164_20221121_0931_77_10_0992 15.5
14 1515.854 SBE19_6164_20221121_1150_77_10_0993 15.2
15 1529.996 SBE19_6164_20221121_1400_77_10_0994 15.5
16 1851.438 SBE19_6164_20221122_0801_77_10_0995 18.8
17 1857.582 SBE19_6164_20221122_0850_77_10_0996 19.0
18 1854.918 SBE19_6164_20221122_1139_77_10_0997 18.9
# Longitude for the second half of the files (three spill columns)
q4_22_lon_b <- purrr::map_df(filenames_b,
~read_excel(.x, skip = 20, col_types = "guess") %>%
mutate(filename = .x)) %>%
dplyr::select(c(`Cruise:`, "BITS", "...4", "...5", "filename")) %>% # note the difference in where the coordinates end up
filter(`Cruise:` %in% c("Latitude:", "Longitude:")) %>%
dplyr::rename(E_N = 1) %>%
mutate(E_N = ifelse(E_N == "Latitude:", "lat", "lon"),
coords = paste(BITS, paste(...4, ...5, sep = "."), sep = "")) %>%
dplyr::select(E_N, coords, filename) %>%
filter(E_N == "lon") %>%
dplyr::rename(lon_minute = coords) %>%
dplyr::select(-E_N) %>%
# Drop the trailing ".E" compass suffix produced by the paste above
mutate(lon_minute = str_remove(lon_minute, "\\.E")) %>% # Note this
mutate(lon = format.position(as.numeric(lon_minute)),
filename = str_remove(filename, ".xlsx"))
q4_22_lon_b# A tibble: 17 × 3
lon_minute filename lon
<chr> <chr> <dbl>
1 1904.928 SBE19_6164_20221122_1241_77_10_0998 19.1
2 196.691 SBE19_6164_20221122_1506_77_10_0999 19.1
3 199.923 SBE19_6164_20221122_1546_77_10_1000 19.2
4 1929.939 SBE19_6164_20221123_0554_77_10_1001 19.5
5 1927.262 SBE19_6164_20221123_0934_77_10_1002 19.4
6 1930.352 SBE19_6164_20221123_1051_77_10_1003 19.5
7 177.876 SBE19_6164_20221124_0506_77_10_1004 17.1
8 1720.160 SBE19_6164_20221124_1408_77_10_1006 17.3
9 1751.198 SBE19_6164_20221124_1642_77_10_1006 17.8
10 1755.73 SBE19_6164_20221125_0550_77_10_1008 17.9
11 1726.150 SBE19_6164_20221125_1112_77_10_1009 17.4
12 1711.312 SBE19_6164_20221126_0556_77_10_1010 17.2
13 171.53 SBE19_6164_20221126_1033_77_10_1012 17.0
14 1656.90 SBE19_6164_20221126_1315_77_10_1013 16.9
15 1433.258 SBE19_6164_20221127_0505_77_10_1014 14.6
16 1422.920 SBE19_6164_20221127_1041_77_10_1015 14.4
17 1429.208 SBE19_6164_20221127_1409_77_10_1017 14.5
# Stack the two file-format halves back together, then build the
# one-row-per-station coordinate table
q4_22_lat <- bind_rows(q4_22_lat_a, q4_22_lat_b)
q4_22_lon <- bind_rows(q4_22_lon_a, q4_22_lon_b)
coords <- q4_22_lat %>% left_join(q4_22_lon, by = "filename") %>% dplyr::select(filename, lat, lon)
coords# A tibble: 35 × 3
filename lat lon
<chr> <dbl> <dbl>
1 SBE19_6164_20221118_0821_77_10_0979 57.4 16.9
2 SBE19_6164_20221118_1027_77_10_0980 57.4 17.0
3 SBE19_6164_20221118_1512_77_10_0982 57.3 17.4
4 SBE19_6164_20221119_0536_77_10_0983 55.6 16.2
5 SBE19_6164_20221119_0550_77_10_LERR 55.6 16.2
6 SBE19_6164_20221119_1006_77_10_0984 55.8 16.0
7 SBE19_6164_20221119_1209_77_10_0985 55.8 16
8 SBE19_6164_20221119_1401_77_10_0986 55.8 16.1
9 SBE19_6164_20221120_0542_77_10_0987 55.2 13.6
10 SBE19_6164_20221120_1047_77_10_0988 55.3 13.7
# ℹ 25 more rows
# Attach station coordinates to every q4 2022 CTD row
q4_22 <- q4_22 %>% left_join(coords, by = "filename")
# Drop strange coords for now
hist(q4_22$lat)hist(q4_22$lon)q4_22 <- q4_22 %>% filter(lon > 10)Combine all oxygen CTD’s with the matching key. Then summarise them by key, year and quarter before joining.
Note also the depth issue here. Seems like it’s bimodal, with unrealistic values that seem to be one digit too large!?
# Stack all CTD data that carries an smhi serial number and build the
# composite key (quarter_year_serial) used to join against the trawl data
oxy_with_key <- bind_rows(oxy_old_ctd, # multiple file data with serial no
q4_19, q1_20) %>% # single file data with serial no
mutate(smhi_serial_no_all = paste(q_year, smhi_serial_no, sep = "_"))
# Summarise data before joining with trawl data. First do it on the depth data.
# Per haul: rows within 3 m of the deepest CTD reading are "near bottom";
# the near-bottom readings are then averaged per haul.
# NOTE(review): mean() here has no na.rm = TRUE — any NA oxy/temp propagates; confirm intended
oxy_with_key_trim_depth <- oxy_with_key %>%
filter(!is.na(depth)) %>%
group_by(smhi_serial_no_all) %>%
mutate(ctd_depth = ifelse(depth > max(depth) -3, "near bottom", "pelagic")) %>%
ungroup() %>%
filter(ctd_depth == "near bottom") %>%
group_by(smhi_serial_no_all) %>%
summarise(oxy = mean(oxy),
depth = mean(depth),
temp = mean(temp))
oxy_with_key_trim_depth# A tibble: 256 × 4
smhi_serial_no_all oxy depth temp
<chr> <dbl> <dbl> <dbl>
1 1_2016_1 7.53 59.9 3.72
2 1_2016_11 4.51 50.5 7.34
3 1_2016_13 8.12 40.6 3.92
4 1_2016_15 8.07 46.6 4.13
5 1_2016_17 6.39 56.0 4.47
6 1_2016_19 4.02 56.5 4.97
7 1_2016_22 3.72 62.9 4.87
8 1_2016_24 8.40 39.6 3.58
9 1_2016_26 1.88 68.9 5.29
10 1_2016_27 2.70 108. 7.83
# ℹ 246 more rows
# Hauls with no depth column: fall back to the reading(s) at maximum pressure.
# NOTE(review): ties on max pressure keep multiple rows per key, and the trailing
# group_by() leaves the result grouped without a summarise — confirm intended
oxy_with_key_trim_depth_NA <- oxy_with_key %>%
filter(is.na(depth)) %>%
group_by(smhi_serial_no_all) %>%
filter(pressue_bar == max(pressue_bar)) %>%
group_by(smhi_serial_no_all)
oxy_with_key_trim_depth_NA# A tibble: 78 × 10
# Groups: smhi_serial_no_all [74]
temp oxy depth lat lon filename q_year smhi_serial_no pressue_bar
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
1 12.0 0.964 NA NA NA <NA> 4_2019 64 71.2
2 12.9 2.40 NA NA NA <NA> 4_2019 65 59.0
3 8.85 6.22 NA NA NA <NA> 4_2019 66 38.6
4 8.84 0 NA NA NA <NA> 4_2019 67 38.6
5 8.84 0 NA NA NA <NA> 4_2019 67 38.6
6 9.09 5.78 NA NA NA <NA> 4_2019 68 49.4
7 8.74 6.40 NA NA NA <NA> 4_2019 69 40.6
8 5.58 0.361 NA NA NA <NA> 4_2019 70 76.6
9 5.67 0.152 NA NA NA <NA> 4_2019 71 101.
10 5.18 1.14 NA NA NA <NA> 4_2019 72 72.2
# ℹ 68 more rows
# ℹ 1 more variable: smhi_serial_no_all <chr>
# Combine the depth-based and pressure-fallback summaries into one join table
oxy_with_key_trim_join <- bind_rows(oxy_with_key_trim_depth,
oxy_with_key_trim_depth_NA) %>%
dplyr::select(smhi_serial_no_all, temp, oxy, depth)
oxy_with_key_trim_join# A tibble: 334 × 4
smhi_serial_no_all temp oxy depth
<chr> <dbl> <dbl> <dbl>
1 1_2016_1 3.72 7.53 59.9
2 1_2016_11 7.34 4.51 50.5
3 1_2016_13 3.92 8.12 40.6
4 1_2016_15 4.13 8.07 46.6
5 1_2016_17 4.47 6.39 56.0
6 1_2016_19 4.97 4.02 56.5
7 1_2016_22 4.87 3.72 62.9
8 1_2016_24 3.58 8.40 39.6
9 1_2016_26 5.29 1.88 68.9
10 1_2016_27 7.83 2.70 108.
# ℹ 324 more rows
# Join with data!
# Restrict trawl hauls to the quarter/year combos covered by the keyed CTD data,
# then attach the CTD summaries by the composite key
d_with_key <- d %>% filter(q_year %in% c(unique(oxy_with_key$q_year)))
d_with_key <- d_with_key %>% left_join(oxy_with_key_trim_join, by = "smhi_serial_no_all")
# Which ones are missing?
d_with_key %>%
filter(is.na(oxy)) lon lat X Y year quarter smhi_serial_no haul_id
1 14.65000 55.46667 477.8717 6146.779 2017 4 3 2017_4_4
2 14.81667 55.68333 488.4726 6170.852 2018 1 11 2018_1_12
3 15.25000 55.76667 515.6857 6180.139 2018 1 16 2018_1_15
4 15.38333 55.75000 524.0617 6178.322 2018 1 17 2018_1_18
5 15.78333 55.63333 549.3158 6165.550 2018 1 20 2018_1_19
6 14.36667 55.70000 460.1953 6172.873 2020 1 142 2020_1_91
bottom_depth q_year smhi_serial_no_all temp oxy depth
1 67 4_2017 4_2017_3 NA NA NA
2 59 1_2018 1_2018_11 NA NA NA
3 56 1_2018 1_2018_16 NA NA NA
4 61 1_2018 1_2018_17 NA NA NA
5 68 1_2018 1_2018_20 NA NA NA
6 38 1_2020 1_2020_142 NA NA NA
# Don't know why these are missing but we will move on.
# Next we can check how close to the bottom our average is. Before doing that, check depth again.
# Map the hauls, split by whether the recorded bottom depth looks plausible
# (> 250 m is treated as suspicious; see the /10 correction below).
d_with_key %>%
mutate(depth2 = ifelse(bottom_depth > 250, "deeper than 250 m", "normal")) %>%
ggplot(aes(lon, lat, color = as.factor(year), shape = as.factor(quarter))) +
geom_point() +
facet_wrap(~ depth2, ncol = 1) +
coord_sf()d_with_key %>%
mutate(depth2 = ifelse(bottom_depth > 250, "deeper than 250 m", "normal")) %>%
group_by(year, depth2) %>%
summarise(n = n()) %>%
ggplot(aes(year, n, fill = depth2)) +
geom_bar(stat = "identity")# I am going to assume that if depth is deeper than 250, I divide by 10, because in script 01, this gives a very good fit to raster-derived depth data
# Compare the (corrected) trawl-recorded bottom depth with the deepest CTD
# reading per haul; large positive differences mean the CTD cast likely did
# not reach the bottom.
d_with_key %>%
drop_na(depth) %>% # These are the ones without any depth info
mutate(bottom_depth2 = ifelse(bottom_depth > 250, bottom_depth/10, bottom_depth),
depth_diff = bottom_depth2 - depth) %>%
ggplot(aes(smhi_serial_no_all, depth_diff)) +
geom_point()Even though correcting for the depth issue and looking only at data where we have depth from trawl and from CTD, we can still see clearly that the deepest recorded CTD data is not always near the bottom.
Now do the ones with missing keys where we have to match by spatial location.
# Surveys whose CTD files lack the SMHI serial-number key:
# q4_20 is single-file data, the rest are multi-file data.
oxy_without_key <-
  bind_rows(q4_20, q1_21, q4_21, q1_22, q4_22) %>%
  filter(lon > 5) # keep only casts east of 5 degrees longitude
# Average the near-bottom readings per cast. No serial number is available
# here, so the cast id is built from position and survey (lat_lon_q_year).
# "Near bottom" = within 3 m of the deepest reading of that cast.
oxy_without_key_trim <- oxy_without_key %>%
  mutate(id = paste(lat, lon, q_year, sep = "_")) %>%
  group_by(id) %>%
  filter(depth > max(depth) - 3) %>%
  summarise(oxy = mean(oxy),
            depth = mean(depth),
            temp = mean(temp))
# https://stackoverflow.com/questions/59621797/evaluating-the-closest-distance-from-one-point-between-multiple-options
# Trawl hauls as an sf point layer (WGS84), restricted to the surveys that
# appear in the no-key CTD data.
trawl_sf <- d %>%
  filter(q_year %in% unique(oxy_without_key$q_year)) %>%
  st_as_sf(coords = c("lon", "lat"), remove = FALSE, crs = 4326)
# CTD casts as an sf point layer (WGS84); lat/lon/quarter/year are recovered
# from the composite id ("lat_lon_quarter_year").
ctd_sf <- oxy_without_key_trim %>%
  separate(id, sep = "_", into = c("lat", "lon", "quarter", "year"), convert = TRUE) %>%
  mutate(q_year = paste(quarter, year, sep = "_")) %>%
  drop_na(lat, lon) %>%
  st_as_sf(coords = c("lon", "lat"), remove = FALSE, crs = 4326)
# For each survey without a serial-number key, match every trawl haul to its
# nearest CTD cast, record the matched values and the match distance.
lis <- list()
for (i in unique(oxy_without_key$q_year)) {
  trawl_sf_i <- filter(trawl_sf, q_year == i)
  ctd_sf_i <- filter(ctd_sf, q_year == i)
  # Attach the attributes of the nearest CTD cast to each haul
  df_near <- st_join(trawl_sf_i, ctd_sf_i, join = st_nearest_feature)
  # Distance in metres to the nearest CTD cast, computed in the SAME row
  # order as trawl_sf_i / df_near.
  # BUG FIX: the original took the distance column from a data frame sorted
  # by descending distance (arrange(desc(dist))), which misaligned distances
  # with hauls; it also stripped the units with str_remove(" [m]"). Using
  # as.numeric() on the units vector is both correct and order-safe.
  nearest_idx <- st_nearest_feature(trawl_sf_i, ctd_sf_i)
  dists_m <- as.numeric(st_distance(trawl_sf_i,
                                    ctd_sf_i[nearest_idx, ],
                                    by_element = TRUE))
  # Plot on map
  print(st_connect(trawl_sf_i, ctd_sf_i) %>%
    mapview::mapview() +
    mapview::mapview(trawl_sf, color = 'tomato', col.regions = 'tomato') +
    mapview::mapview(ctd_sf, color = 'steelblue', col.regions = 'steelblue'))
  # Here we can see which values of CTD are matched to trawl data
  print(ggplot() +
    geom_sf(data = trawl_sf, aes(color = "trawl"), size = 3, alpha = .6) +
    geom_sf(data = ctd_sf, aes(color = "ctd"), alpha = .6) +
    scale_color_manual(values = c("steelblue", "tomato"), name = "Data type") +
    geom_sf(data = st_connect(trawl_sf, ctd_sf), linewidth = 0.2))
  lis[[i]] <- df_near %>%
    as.data.frame() %>%
    dplyr::select(haul_id, oxy, depth, temp) %>%
    mutate(q_year = i,
           dist_m = dists_m)
}Calculating nearest IDs
|
| | 0%
|
|==== | 6%
|
|======== | 12%
|
|============ | 18%
|
|================ | 24%
|
|===================== | 29%
|
|========================= | 35%
|
|============================= | 41%
|
|================================= | 47%
|
|===================================== | 53%
|
|========================================= | 59%
|
|============================================= | 65%
|
|================================================= | 71%
|
|====================================================== | 76%
|
|========================================================== | 82%
|
|============================================================== | 88%
|
|================================================================== | 94%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|== | 2%
|
|=== | 5%
|
|===== | 7%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 17%
|
|============== | 20%
|
|=============== | 22%
|
|================= | 24%
|
|=================== | 27%
|
|==================== | 29%
|
|====================== | 32%
|
|======================== | 34%
|
|========================== | 37%
|
|=========================== | 39%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================ | 46%
|
|================================== | 49%
|
|==================================== | 51%
|
|====================================== | 54%
|
|======================================= | 56%
|
|========================================= | 59%
|
|=========================================== | 61%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 68%
|
|================================================== | 71%
|
|=================================================== | 73%
|
|===================================================== | 76%
|
|======================================================= | 78%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|=================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|=== | 4%
|
|===== | 7%
|
|======== | 11%
|
|========== | 15%
|
|============= | 19%
|
|================ | 22%
|
|================== | 26%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================= | 41%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================= | 59%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|==================================================== | 74%
|
|====================================================== | 78%
|
|========================================================= | 81%
|
|============================================================ | 85%
|
|============================================================== | 89%
|
|================================================================= | 93%
|
|=================================================================== | 96%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|========= | 12%
|
|============ | 17%
|
|=============== | 21%
|
|================== | 25%
|
|==================== | 29%
|
|======================= | 33%
|
|========================== | 38%
|
|============================= | 42%
|
|================================ | 46%
|
|=================================== | 50%
|
|====================================== | 54%
|
|========================================= | 58%
|
|============================================ | 62%
|
|=============================================== | 67%
|
|================================================== | 71%
|
|==================================================== | 75%
|
|======================================================= | 79%
|
|========================================================== | 83%
|
|============================================================= | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|=== | 4%
|
|====== | 8%
|
|======== | 12%
|
|=========== | 16%
|
|============== | 20%
|
|================= | 24%
|
|==================== | 28%
|
|====================== | 32%
|
|========================= | 36%
|
|============================ | 40%
|
|=============================== | 44%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 56%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 68%
|
|================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 80%
|
|=========================================================== | 84%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
Calculating lines
Calculating nearest IDs
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 4%
|
|==== | 5%
|
|==== | 6%
|
|===== | 7%
|
|====== | 8%
|
|====== | 9%
|
|======= | 10%
|
|======== | 11%
|
|======== | 12%
|
|========= | 13%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 16%
|
|============ | 17%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 33%
|
|======================== | 34%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 37%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 40%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 43%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 49%
|
|=================================== | 50%
|
|==================================== | 51%
|
|===================================== | 52%
|
|===================================== | 53%
|
|====================================== | 54%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 57%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 60%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 63%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 66%
|
|=============================================== | 67%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 83%
|
|=========================================================== | 84%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 87%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 90%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 93%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 100%
Calculating lines
# Combine the per-haul nearest-CTD results into one data frame
oxy_without_key_trim_join <- bind_rows(lis)

# Join these data into the trawl data by haul_id
d_without_key <- d %>%
  filter(q_year %in% unique(oxy_without_key$q_year)) %>%
  left_join(oxy_without_key_trim_join, by = "haul_id")

# Because we join by nearest CTD cast, no environmental variables are missing
# after the join -- but the matched cast can be far from the haul, so the
# values can be way off. Inspect the distance distribution first.
hist(d_without_key$dist_m)

# Set environmental values to NA when the matched CTD cast is farther than
# 5 km from the haul. NOTE(review): this affects ~20% of the data.
max_ctd_dist_m <- 5000
d_without_key <- d_without_key %>%
  mutate(across(c(oxy, temp, depth),
                ~ ifelse(dist_m > max_ctd_dist_m, NA, .x)))
# Which ones are missing? Pretty spread out. Do not suspect a systematic error here
d_without_key %>%
  filter(is.na(oxy))
1 17.01667 57.40000 621.1803 6363.710 2020 4 704 2020_4_213
2 19.16667 57.60000 748.9352 6391.826 2020 4 710 2020_4_218
3 13.60000 55.20000 410.8932 6117.942 2020 4 730 2020_4_237
4 13.91667 55.28333 431.1914 6126.856 2020 4 NA 2020_4_238
5 13.95000 55.25000 433.2526 6123.114 2020 4 NA 2020_4_239
6 16.06667 55.86667 566.7531 6191.754 2021 1 194 2021_1_54
7 13.71667 55.26667 418.4548 6125.217 2021 1 NA 2021_1_60
8 13.91667 55.28333 431.1914 6126.856 2021 1 NA 2021_1_61
9 13.98333 55.25000 435.3715 6123.083 2021 1 NA 2021_1_62
10 14.46667 55.45000 466.2665 6144.998 2021 1 203 2021_1_63
11 15.95000 55.78333 559.5795 6182.374 2021 1 195 2021_1_55
12 14.68333 55.63333 480.0637 6165.317 2021 1 204 2021_1_64
13 14.85000 55.68333 490.5685 6170.847 2021 1 205 2021_1_65
14 13.30000 55.16667 391.7102 6114.657 2021 4 966 2021_4_224
15 14.58333 55.60000 473.7457 6161.641 2021 4 970 2021_4_228
16 14.36667 55.70000 460.1953 6172.873 2022 1 170 2022_1_50
17 14.45000 55.66667 465.4032 6169.119 2022 1 175 2022_1_55
18 14.73333 55.66667 483.2257 6169.014 2022 1 NA 2022_1_56
19 15.91667 55.78333 557.4890 6182.346 2022 1 177 2022_1_57
20 16.91667 57.35000 615.3292 6357.972 2022 4 NA 2022_4_240
21 13.63333 55.20000 413.0146 6117.900 2022 4 987 2022_4_249
22 13.91667 55.28333 431.1914 6126.856 2022 4 NA 2022_4_251
23 15.90000 55.78333 556.4438 6182.332 2022 4 NA 2022_4_245
24 15.56667 55.90000 535.4327 6195.095 2022 4 991 2022_4_253
25 15.55000 55.83333 534.4496 6187.667 2022 4 NA 2022_4_254
26 15.26667 55.81667 516.7100 6185.708 2022 4 NA 2022_4_255
bottom_depth q_year.x smhi_serial_no_all oxy depth temp q_year.y dist_m
1 552 4_2020 4_2020_704 NA NA NA 4_2020 54285.507
2 465 4_2020 4_2020_710 NA NA NA 4_2020 49514.482
3 433 4_2020 4_2020_730 NA NA NA 4_2020 46388.757
4 371 4_2020 4_2020_NA NA NA NA 4_2020 14921.230
5 381 4_2020 4_2020_NA NA NA NA 4_2020 6327.592
6 551 1_2021 1_2021_194 NA NA NA 1_2021 9970.532
7 404 1_2021 1_2021_NA NA NA NA 1_2021 7421.875
8 366 1_2021 1_2021_NA NA NA NA 1_2021 6354.334
9 373 1_2021 1_2021_NA NA NA NA 1_2021 6326.300
10 547 1_2021 1_2021_203 NA NA NA 1_2021 6290.197
11 612 1_2021 1_2021_195 NA NA NA 1_2021 5635.703
12 611 1_2021 1_2021_204 NA NA NA 1_2021 5272.994
13 53 1_2021 1_2021_205 NA NA NA 1_2021 5226.216
14 418 4_2021 4_2021_966 NA NA NA 4_2021 10642.698
15 659 4_2021 4_2021_970 NA NA NA 4_2021 9550.928
16 395 1_2022 1_2022_170 NA NA NA 1_2022 12077.134
17 502 1_2022 1_2022_175 NA NA NA 1_2022 9334.529
18 552 1_2022 1_2022_NA NA NA NA 1_2022 7856.897
19 613 1_2022 1_2022_177 NA NA NA 1_2022 5938.855
20 445 4_2022 4_2022_NA NA NA NA 4_2022 10645.092
21 434 4_2022 4_2022_987 NA NA NA 4_2022 7247.593
22 373 4_2022 4_2022_NA NA NA NA 4_2022 5933.495
23 601 4_2022 4_2022_NA NA NA NA 4_2022 5663.951
24 50 4_2022 4_2022_991 NA NA NA 4_2022 5373.542
25 399 4_2022 4_2022_NA NA NA NA 4_2022 5272.994
26 517 4_2022 4_2022_NA NA NA NA 4_2022 5232.911
# Next we can check how close to the bottom our average is. Before doing that,
# check depth again: flag hauls with implausibly deep bottom_depth (> 250 m).
d_without_key %>%
  mutate(depth2 = ifelse(bottom_depth > 250, "deeper than 250 m", "normal")) %>%
  ggplot(aes(lon, lat, color = as.factor(year), shape = as.factor(quarter))) +
  geom_point() +
  facet_wrap(~ depth2, ncol = 1) +
  coord_sf()

# How many hauls per year fall in each depth category?
d_without_key %>%
  mutate(depth2 = ifelse(bottom_depth > 250, "deeper than 250 m", "normal")) %>%
  count(year, depth2, name = "n") %>%
  ggplot(aes(year, n, fill = depth2)) +
  geom_col()

# I am going to assume that if bottom_depth is deeper than 250 it should be
# divided by 10 (presumably recorded in decimeters -- TODO confirm), because
# in script 01 this gives a very good fit to raster-derived depth data.
d_without_key %>%
  drop_na(depth) %>% # These are the ones without any depth info
  mutate(bottom_depth2 = ifelse(bottom_depth > 250, bottom_depth / 10, bottom_depth),
         depth_diff = bottom_depth2 - depth) %>%
  ggplot(aes(smhi_serial_no_all, depth_diff)) +
  geom_point()
# Interesting... the depth difference is now spread around 0, as we would expect.

# Join and save these data:
# Combine both join strategies -- hauls matched via the smhi_serial_no key and
# hauls matched by nearest CTD cast -- keeping only rows with oxy and temp.
all_combined_ctd_trawl_data <- bind_rows(
  d_with_key %>% select(oxy, depth, temp, haul_id),
  d_without_key %>% select(oxy, depth, temp, haul_id)
) %>%
  drop_na(oxy, temp)

str(all_combined_ctd_trawl_data)

write_csv(all_combined_ctd_trawl_data,
          paste0(home, "/data/clean/ctd_trawl_joined.csv"))

# Final check on how many stomachs and hauls that can't be matched
d_all <- read_csv(paste0(home, "/data/clean/aggregated_stomach_data.csv"))
nrow(d_all)

d_all <- d_all %>% left_join(all_combined_ctd_trawl_data, by = "haul_id")
names(d_all) [1] "pred_id" "amphipoda_tot" "bivalvia_tot"
[4] "clupeidae_tot" "clupea_harengus_tot" "gadus_morhua_tot"
[7] "gobiidae_tot" "mysidae_tot" "non_bio_tot"
[10] "other_crustacea_tot" "other_tot" "other_pisces_tot"
[13] "platichthys_flesus_tot" "polychaeta_tot" "saduria_entomon_tot"
[16] "sprattus_sprattus_tot" "predator_latin_name" "species"
[19] "pred_weight_g" "pred_length_cm" "year"
[22] "quarter" "month" "ices_rect"
[25] "subdiv" "haul_id" "smhi_serial_no"
[28] "X" "Y" "lat"
[31] "lon" "fle_kg_km2" "mcod_kg_km2"
[34] "scod_kg_km2" "date" "bottom_depth"
[37] "group" "depth.x" "density_saduria"
[40] "oxy.x" "oxy.y" "depth.y"
[43] "temp"
# 13 % of all stomachs
d_all %>% drop_na(oxy.y)# A tibble: 8,943 × 43
pred_id amphipoda_tot bivalvia_tot clupeidae_tot clupea_harengus_tot
<chr> <dbl> <dbl> <dbl> <dbl>
1 2015_4_COD_1 0 0 0 0
2 2015_4_COD_101 0 0 0 0
3 2015_4_COD_103 0 0 0 0
4 2015_4_COD_104 0 0 0 0
5 2015_4_COD_106 0 0 0 0
6 2015_4_COD_108 0 0 0 0
7 2015_4_COD_11 0 0 0 0
8 2015_4_COD_111 0 0 1.74 0
9 2015_4_COD_113 0 0 0 0
10 2015_4_COD_114 0.04 0 0 0
# ℹ 8,933 more rows
# ℹ 38 more variables: gadus_morhua_tot <dbl>, gobiidae_tot <dbl>,
# mysidae_tot <dbl>, non_bio_tot <dbl>, other_crustacea_tot <dbl>,
# other_tot <dbl>, other_pisces_tot <dbl>, platichthys_flesus_tot <dbl>,
# polychaeta_tot <dbl>, saduria_entomon_tot <dbl>,
# sprattus_sprattus_tot <dbl>, predator_latin_name <chr>, species <chr>,
# pred_weight_g <dbl>, pred_length_cm <dbl>, year <dbl>, quarter <dbl>, …
# 9% of all hauls
d_all %>% distinct(haul_id, .keep_all = TRUE) %>% drop_na(oxy.y)# A tibble: 322 × 43
pred_id amphipoda_tot bivalvia_tot clupeidae_tot clupea_harengus_tot
<chr> <dbl> <dbl> <dbl> <dbl>
1 2015_4_COD_1 0 0 0 0
2 2015_4_COD_101 0 0 0 0
3 2015_4_COD_138 0 0 0 0
4 2015_4_COD_169 0 0 0 8.22
5 2015_4_COD_249 0 0 0 0
6 2015_4_COD_335 0 0 0 0
7 2015_4_COD_342 0 0 0 0
8 2015_4_COD_379 0 0 0 0
9 2015_4_COD_410 0 0 0 0
10 2015_4_COD_440 0 0 0 0
# ℹ 312 more rows
# ℹ 38 more variables: gadus_morhua_tot <dbl>, gobiidae_tot <dbl>,
# mysidae_tot <dbl>, non_bio_tot <dbl>, other_crustacea_tot <dbl>,
# other_tot <dbl>, other_pisces_tot <dbl>, platichthys_flesus_tot <dbl>,
# polychaeta_tot <dbl>, saduria_entomon_tot <dbl>,
# sprattus_sprattus_tot <dbl>, predator_latin_name <chr>, species <chr>,
# pred_weight_g <dbl>, pred_length_cm <dbl>, year <dbl>, quarter <dbl>, …
Final conclusions:
- Not all trawl data have a CTD measurement, so we'd need to discard stomach data (8% of hauls at minimum, likely more), and some CTD measurements are nonsense
- It is extremely time-consuming to prepare the data, essentially it needs to be done by each survey, and sometimes within each file (station) within a survey
- At which depths do we summarise oxygen data? We cannot know for sure it's the bottom, and the data from the trawl and the CTD make it very clear the deepest recording is not always the bottom
- How far from the trawl location is a CTD cast OK to use?